From 4c3c6cade5eecee65c76d82a8996ffe406dd00b3 Mon Sep 17 00:00:00 2001 From: Bronley Date: Fri, 24 Sep 2021 08:01:49 -0400 Subject: [PATCH 1/6] Basic regex literal lexer support. --- src/lexer/Lexer.spec.ts | 22 ++++++++++++ src/lexer/Lexer.ts | 75 ++++++++++++++++++++++++++++++++++++----- src/lexer/TokenKind.ts | 1 + 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/src/lexer/Lexer.spec.ts b/src/lexer/Lexer.spec.ts index c12a6ba30..c5ac046fc 100644 --- a/src/lexer/Lexer.spec.ts +++ b/src/lexer/Lexer.spec.ts @@ -1222,4 +1222,26 @@ describe('lexer', () => { TokenKind.Eof ]); }); + + describe('regular expression literals', () => { + function testRegex(...regexps) { + const results = [] as string[]; + for (const regexp of regexps) { + const { tokens } = Lexer.scan(regexp); + results.push(tokens[0].text); + } + expect(results).to.eql(regexps); + } + + it('recognizes regex literals', () => { + testRegex( + '/simple/', + '/SimpleWithValidFlags/imsx', + '/UnknownFlags/VUI', + '/with spaces/andflags', + '/with(parens)and[squarebraces]/', + '/*()^$@/' + ); + }); + }); }); diff --git a/src/lexer/Lexer.ts b/src/lexer/Lexer.ts index 962f4ab1d..d10c2c95f 100644 --- a/src/lexer/Lexer.ts +++ b/src/lexer/Lexer.ts @@ -199,14 +199,17 @@ export class Lexer { } }, '/': function (this: Lexer) { - switch (this.peek()) { - case '=': - this.advance(); - this.addToken(TokenKind.ForwardslashEqual); - break; - default: - this.addToken(TokenKind.Forwardslash); - break; + //try capturing a regex literal. 
If that doesn't work, fall back to normal handling + if (!this.regexLiteral()) { + switch (this.peek()) { + case '=': + this.advance(); + this.addToken(TokenKind.ForwardslashEqual); + break; + default: + this.addToken(TokenKind.Forwardslash); + break; + } } }, '\\': function (this: Lexer) { @@ -384,6 +387,19 @@ export class Lexer { this.columnEnd++; } + private lookaheadStack = [] as Array<{ current: number; columnEnd: number }>; + private pushLookahead() { + this.lookaheadStack.push({ + current: this.current, + columnEnd: this.columnEnd + }); + } + private popLookahead() { + const { current, columnEnd } = this.lookaheadStack.pop(); + this.current = current; + this.columnEnd = columnEnd; + } + /** * Returns the character at position `current` or a null character if we've reached the end of * input. @@ -835,6 +851,17 @@ export class Lexer { return candidates.includes(this.source.charAt(this.current)); } + /** + * Advance if the current token matches one of the candidates + */ + private advanceIf(...candidates: string[]) { + if (this.check(...candidates)) { + this.advance(); + return true; + } + return false; + } + /** * Check the previous character */ @@ -927,6 +954,38 @@ export class Lexer { } } + /** + * Capture a regex literal token. Returns false if not found. 
+ * This is lookahead lexing which might technically belong in the parser, + * but it's easy enough to do here in the lexer + */ + private regexLiteral() { + this.pushLookahead(); + + //finite loop to prevent infinite loop if something went wrong + for (let i = this.current; i < this.source.length; i++) { + + //if we reached the end of the regex, consume any flags + if (this.advanceIf('/')) { + //consume all flag-like chars (let the parser validate the actual values) + while (/[a-z]/i.exec(this.peek())) { + this.advance(); + } + //finalize the regex literal and EXIT + this.addToken(TokenKind.RegexLiteral); + return true; + + //if we found a non-escaped newline, there's a syntax error with this regex (or it's not a regex), so quit + } else if (this.check('\n')) { + break; + } else { + this.advance(); + } + } + this.popLookahead(); + return false; + } + /** * Creates a `Token` and adds it to the `tokens` array. * @param kind the type of token to produce. diff --git a/src/lexer/TokenKind.ts b/src/lexer/TokenKind.ts index f01bb4b0c..70ed8d7df 100644 --- a/src/lexer/TokenKind.ts +++ b/src/lexer/TokenKind.ts @@ -52,6 +52,7 @@ export enum TokenKind { DoubleLiteral = 'DoubleLiteral', LongIntegerLiteral = 'LongIntegerLiteral', EscapedCharCodeLiteral = 'EscapedCharCodeLiteral', //this is used to capture things like `\n`, `\r\n` in template strings + RegexLiteral = 'RegexLiteral', //types Void = 'Void', From d71e614c3990c304479783e21660298f0127cce0 Mon Sep 17 00:00:00 2001 From: Bronley Date: Fri, 24 Sep 2021 09:11:09 -0400 Subject: [PATCH 2/6] Add lexer support for escaped regexp chars --- src/lexer/Lexer.spec.ts | 30 ++++++++++++++++++++++-------- src/lexer/Lexer.ts | 22 +++++++++------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/src/lexer/Lexer.spec.ts b/src/lexer/Lexer.spec.ts index c5ac046fc..24dab7192 100644 --- a/src/lexer/Lexer.spec.ts +++ b/src/lexer/Lexer.spec.ts @@ -1224,10 +1224,11 @@ describe('lexer', () => { }); describe('regular
expression literals', () => { - function testRegex(...regexps) { + function testRegex(...regexps: Array) { + regexps = regexps.map(x => x.toString()); const results = [] as string[]; for (const regexp of regexps) { - const { tokens } = Lexer.scan(regexp); + const { tokens } = Lexer.scan(regexp as string); results.push(tokens[0].text); } expect(results).to.eql(regexps); @@ -1235,12 +1236,25 @@ describe('lexer', () => { it('recognizes regex literals', () => { testRegex( - '/simple/', - '/SimpleWithValidFlags/imsx', - '/UnknownFlags/VUI', - '/with spaces/andflags', - '/with(parens)and[squarebraces]/', - '/*()^$@/' + /simple/, + /SimpleWithValidFlags/g, + /UnknownFlags/gi, + /with spaces/s, + /with(parens)and[squarebraces]/, + //lots of special characters + /.*()^$@/ + ); + }); + + it('handles escape characters properly', () => { + testRegex( + //an escaped forward slash right next to the end-regexp forwardslash + /\//, + /\r/, + /\n/, + /\r\n/, + //a literal backslash in front of an escape backslash + /\\\n/ ); }); }); diff --git a/src/lexer/Lexer.ts b/src/lexer/Lexer.ts index d10c2c95f..ae159303f 100644 --- a/src/lexer/Lexer.ts +++ b/src/lexer/Lexer.ts @@ -851,17 +851,6 @@ export class Lexer { return candidates.includes(this.source.charAt(this.current)); } - /** - * Advance if the current token matches one of the candidates - */ - private advanceIf(...candidates: string[]) { - if (this.check(...candidates)) { - this.advance(); - return true; - } - return false; - } - /** * Check the previous character */ @@ -962,11 +951,14 @@ export class Lexer { private regexLiteral() { this.pushLookahead(); + let nextCharNeedsEscaped = false; + //finite loop to prevent infinite loop if something went wrong for (let i = this.current; i < this.source.length; i++) { //if we reached the end of the regex, consume any flags - if (this.advanceIf('/')) { + if (this.check('/') && !nextCharNeedsEscaped) { + this.advance(); //consume all flag-like chars (let the parser validate the actual 
values) while (/[a-z]/i.exec(this.peek())) { this.advance(); @@ -976,10 +968,14 @@ export class Lexer { return true; //if we found a non-escaped newline, there's a syntax error with this regex (or it's not a regex), so quit - } else if (this.check('\n')) { + } else if (this.check('\n') || this.isAtEnd()) { break; + } else if (this.check('\\')) { + this.advance(); + nextCharNeedsEscaped = true; } else { this.advance(); + nextCharNeedsEscaped = false; } } this.popLookahead(); From f8a6c1d7ced24d7649691eae1df782161c29e380 Mon Sep 17 00:00:00 2001 From: Bronley Date: Fri, 24 Sep 2021 10:29:28 -0400 Subject: [PATCH 3/6] Add parser and transpile functionality --- src/parser/Expression.ts | 40 ++++++++++++++ src/parser/Parser.ts | 9 ++++ .../expression/RegexLiteralExpression.spec.ts | 54 +++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 src/parser/tests/expression/RegexLiteralExpression.spec.ts diff --git a/src/parser/Expression.ts b/src/parser/Expression.ts index aef007f9a..073002b3e 100644 --- a/src/parser/Expression.ts +++ b/src/parser/Expression.ts @@ -1406,6 +1406,46 @@ export class NullCoalescingExpression extends Expression { } } +export class RegexLiteralExpression extends Expression { + public constructor( + public tokens: { + regexLiteral: Token; + } + ) { + super(); + } + + public get range() { + return this.tokens.regexLiteral.range; + } + + public transpile(state: BrsTranspileState): TranspileResult { + let text = this.tokens.regexLiteral?.text ?? 
''; + let flags = ''; + //get any flags from the end + const flagMatch = /\/([a-z]+)$/i.exec(text); + if (flagMatch) { + text = text.substring(0, flagMatch.index + 1); + flags = flagMatch[1]; + } + //remove leading and trailing slashes + const pattern = text.substring(1, text.length - 1); + + return [ + state.sourceNode(this.tokens.regexLiteral, [ + 'CreateObject("roRegex", ', + `"${pattern}", `, + `"${flags}"`, + ')' + ]) + ]; + } + + walk(visitor: WalkVisitor, options: WalkOptions) { + //nothing to walk + } +} + // eslint-disable-next-line @typescript-eslint/consistent-indexed-object-style type ExpressionValue = string | number | boolean | Expression | ExpressionValue[] | { [key: string]: ExpressionValue }; diff --git a/src/parser/Parser.ts b/src/parser/Parser.ts index 795d7886b..54faa418f 100644 --- a/src/parser/Parser.ts +++ b/src/parser/Parser.ts @@ -93,6 +93,7 @@ import { Logger } from '../Logger'; import { isAnnotationExpression, isCallExpression, isCallfuncExpression, isClassMethodStatement, isCommentStatement, isDottedGetExpression, isIfStatement, isIndexedGetExpression, isVariableExpression } from '../astUtils/reflection'; import { createVisitor, WalkMode } from '../astUtils/visitors'; import { createStringLiteral, createToken } from '../astUtils/creators'; +import { RegexLiteralExpression } from '.'; export class Parser { /** @@ -1396,6 +1397,12 @@ export class Parser { return new NullCoalescingExpression(test, questionQuestionToken, alternate); } + private regexLiteralExpression() { + return new RegexLiteralExpression({ + regexLiteral: this.advance() + }); + } + private templateString(isTagged: boolean): TemplateStringExpression | TaggedTemplateStringExpression { this.warnIfNotBrighterScriptMode('template string'); @@ -2544,6 +2551,8 @@ export class Parser { return new VariableExpression(token, this.currentNamespaceName); case this.checkAny(TokenKind.Function, TokenKind.Sub): return this.anonymousFunction(); + case this.check(TokenKind.RegexLiteral): + 
return this.regexLiteralExpression(); case this.check(TokenKind.Comment): return new CommentStatement([this.advance()]); default: diff --git a/src/parser/tests/expression/RegexLiteralExpression.spec.ts b/src/parser/tests/expression/RegexLiteralExpression.spec.ts new file mode 100644 index 000000000..1d0bb1f42 --- /dev/null +++ b/src/parser/tests/expression/RegexLiteralExpression.spec.ts @@ -0,0 +1,54 @@ +import { Program } from '../../../Program'; +import { standardizePath as s } from '../../../util'; +import { getTestTranspile } from '../../../testHelpers.spec'; + +describe('RegexLiteralExpression', () => { + let rootDir = s`${process.cwd()}/rootDir`; + let program: Program; + let testTranspile = getTestTranspile(() => [program, rootDir]); + + beforeEach(() => { + program = new Program({ rootDir: rootDir }); + }); + afterEach(() => { + program.dispose(); + }); + + describe('transpile', () => { + it('captures flags', () => { + testTranspile(` + sub main() + print /hello/gi + end sub + `, ` + sub main() + print CreateObject("roRegex", "hello", "gi") + end sub + `); + }); + + it('handles when no flags', () => { + testTranspile(` + sub main() + print /hello/ + end sub + `, ` + sub main() + print CreateObject("roRegex", "hello", "") + end sub + `); + }); + + it('handles weird escapes', () => { + testTranspile(` + sub main() + print /\\r\\n\\// + end sub + `, ` + sub main() + print CreateObject("roRegex", "\\r\\n\\/", "") + end sub + `); + }); + }); +}); From 2dab1f159dca1f047faf1a306de61c7f34e9c5bb Mon Sep 17 00:00:00 2001 From: Bronley Date: Fri, 24 Sep 2021 10:51:52 -0400 Subject: [PATCH 4/6] Add very basic docs about regex literals --- docs/readme.md | 1 + docs/regex-literals.md | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 docs/regex-literals.md diff --git a/docs/readme.md b/docs/readme.md index 65da5e877..f3e1a04f1 100644 --- a/docs/readme.md +++ b/docs/readme.md @@ -10,6 +10,7 @@ See the following pages for more information - 
[Namespaces](namespaces.md) - [Null-coalescing operator](null-coalescing-operator.md) - [Plugins](plugins.md) + - [Regular Expression Literals](regex-literals.md) - [Source Literals](source-literals.md) - [Template Strings (Template Literals)](template-strings.md) - [Ternary (Conditional) Operator](ternary-operator.md) diff --git a/docs/regex-literals.md b/docs/regex-literals.md new file mode 100644 index 000000000..a3feae1b2 --- /dev/null +++ b/docs/regex-literals.md @@ -0,0 +1,13 @@ +# Regular Expression Literals +You can create a regular expression literal in BrighterScript. This simplifies pattern writing and improves readability. + +Example: +```BrighterScript +print /hello world/ig +``` + +transpiles to: + +```BrightScript +print CreateObject("roRegex", "hello world", "ig") +``` From a6f1d39188dfeda929d595d8b389ea4d4e3a6f6c Mon Sep 17 00:00:00 2001 From: Bronley Date: Fri, 24 Sep 2021 11:27:47 -0400 Subject: [PATCH 5/6] Verify lexer handles quotemark properly --- src/lexer/Lexer.spec.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lexer/Lexer.spec.ts b/src/lexer/Lexer.spec.ts index 24dab7192..765c92c49 100644 --- a/src/lexer/Lexer.spec.ts +++ b/src/lexer/Lexer.spec.ts @@ -1242,7 +1242,9 @@ describe('lexer', () => { /with spaces/s, /with(parens)and[squarebraces]/, //lots of special characters - /.*()^$@/ + /.*()^$@/, + //captures quote char + /"/ ); }); From ad07a8882131d41977d8b0cba1614eac63455864 Mon Sep 17 00:00:00 2001 From: Bronley Date: Fri, 24 Sep 2021 13:13:52 -0400 Subject: [PATCH 6/6] Escape quotemarks --- src/parser/Expression.ts | 7 +++++-- .../tests/expression/RegexLiteralExpression.spec.ts | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/parser/Expression.ts b/src/parser/Expression.ts index 073002b3e..279263be5 100644 --- a/src/parser/Expression.ts +++ b/src/parser/Expression.ts @@ -1428,8 +1428,11 @@ export class RegexLiteralExpression extends Expression { text = text.substring(0,
flagMatch.index + 1); flags = flagMatch[1]; } - //remove leading and trailing slashes - const pattern = text.substring(1, text.length - 1); + let pattern = text + //remove leading and trailing slashes + .substring(1, text.length - 1) + //escape quotemarks + .split('"').join('" + chr(34) + "'); return [ state.sourceNode(this.tokens.regexLiteral, [ diff --git a/src/parser/tests/expression/RegexLiteralExpression.spec.ts b/src/parser/tests/expression/RegexLiteralExpression.spec.ts index 1d0bb1f42..be5542ad8 100644 --- a/src/parser/tests/expression/RegexLiteralExpression.spec.ts +++ b/src/parser/tests/expression/RegexLiteralExpression.spec.ts @@ -50,5 +50,18 @@ describe('RegexLiteralExpression', () => { end sub `); }); + + it('escapes quotemark', () => { + testTranspile(` + sub main() + print /"/ + end sub + `, ` + sub main() + print CreateObject("roRegex", "" + chr(34) + "", "") + end sub + `); + }); + }); });