From e416cd8d9fb7a92816ec4d364e82fbc9da2a6ce9 Mon Sep 17 00:00:00 2001 From: theseanl Date: Sun, 21 Jan 2024 03:32:16 +0900 Subject: [PATCH] feat: support multi-stop token in 'until', macro delimiters The 'until' argspec's behavior was fixed, and while doing so, support for multi-token stops was added. Also, it now properly supports macro delimiters (which was really just a by-product of applying uniform treatment to all logic related to finding braces). This fixes https://github.com/siefkenj/unified-latex/issues/46. --- .../libs/gobble-single-argument.ts | 27 ++++---- .../tests/gobble-single-argument.test.ts | 68 +++++++++++++++++++ .../grammars/xparse-argspec.pegjs | 15 ++-- packages/unified-latex-util-scan/libs/scan.ts | 6 +- 4 files changed, 91 insertions(+), 25 deletions(-) diff --git a/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts b/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts index 9c1ba8ea..a1854537 100644 --- a/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts +++ b/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts @@ -48,8 +48,8 @@ export function gobbleSingleArgument( } }; - const openMark = parseBrace(argSpec.openBrace); - const closeMark = parseBrace(argSpec.closeBrace); + const openMark = parseToken(argSpec.openBrace); + const closeMark = parseToken(argSpec.closeBrace); // Only mandatory arguments can be wrapped in {...}. 
// Since we already parse such things as groups, we need to @@ -148,13 +148,10 @@ export function gobbleSingleArgument( break; } case "until": { - const stopTokens: (string | Ast.Whitespace)[] = argSpec.stopTokens.map(rawToken => { - if (rawToken === " ") { - return { type: "whitespace" }; - } - return rawToken; - }); - + const stopTokens = argSpec.stopTokens.map(parseToken); + // TODO: in order to match xparse's behavior, multiple spaces at the start + // or in a middle should be collapsed to a single whitespace token, + // and spaces at the end should be ignored. let nextStartPos = startPos; let bracePos: [number, number] | undefined; while (nextStartPos < nodes.length) { @@ -189,7 +186,9 @@ export function gobbleSingleArgument( openMark: "", closeMark: printRaw(argSpec.stopTokens), }); - currPos = bracePos[1]; + // Since `stopTokens` may comprise of more than one token, + // we need to advance `currPos` further + currPos = bracePos[1] + stopTokens.length - 1; if (currPos < nodes.length) { currPos++; } @@ -332,13 +331,11 @@ function findDelimiter(nodes: Ast.Node[], token: Braces, startPos: number, endPo return closeMarkPos; } -function parseBrace(str: string | undefined): string | Ast.Macro { +function parseToken(str: string | undefined): string | Ast.Whitespace | Ast.Macro { if (!str) { return ""; } + if (!str.trim()) { return { type: "whitespace" }; } if (str.startsWith("\\")) { - return { - type: "macro", - content: str.slice(1) - } + return { type: "macro", content: str.slice(1) }; } return str; } \ No newline at end of file diff --git a/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts b/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts index 702820fe..2c4dc9a1 100644 --- a/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts +++ b/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts @@ -498,6 +498,7 @@ describe("unified-latex-util-arguments", () => { { type: 
"string", content: "]" }, { type: "string", content: "y" }, ]; + expect( gobbleSingleArgument([...ast], parseArgspec("!o")[0]) ).toMatchObject({ @@ -599,6 +600,38 @@ describe("unified-latex-util-arguments", () => { }); expect(nodes).toEqual([{ content: "yx", type: "string" }]); }); + it("can gobble an 'until' argument with multiple stop tokens", () => { + let argspec = parseArgspec("u{a \\bcd}")[0]; + value = "asdf asydfxya{x}sa \\bcd2df"; + file = processLatexToAstViaUnified().processSync({ value }); + let nodes = trimRenderInfo((file.result as any).content) as Ast.Node[]; + expect(gobbleSingleArgument(nodes, argspec)).toEqual({ + argument: { + type: "argument", + content: [ + // Due to a current implementation of gobbleSingleArgument, + // we may introduce extra string split during the search. + { "type": "string", "content": "a" }, + { "type": "string", "content": "sdf" }, + { "type": "whitespace" }, + { "type": "string", "content": "a" }, + { "type": "string", "content": "sydfxy" }, + { "type": "string", "content": "a" }, + { "type": "group", "content": [{ "type": "string", "content": "x" }] }, + { "type": "string", "content": "s" }, + ], + openMark: "", + closeMark: "a \\bcd", + }, + nodesRemoved: 11, + }); + expect(nodes).toEqual([ + { + "type": "string", + "content": "2df" + } + ]); + }); it("gobbleSingleArgument gobbles non-punctuation delimited arguments", () => { let ast: Ast.Node[] = [ { type: "whitespace" }, @@ -692,6 +725,41 @@ describe("unified-latex-util-arguments", () => { } ); }); + it("gobbleSingleArgument gobbles arguments delimited by tokens", () => { + let ast: Ast.Node[] = [ + { "type": "macro", "content": "a" }, + { "type": "group", "content": [{ "type": "string", "content": "123" }] }, + { "type": "string", "content": "1" } + ]; + expect(gobbleSingleArgument(ast, parseArgspec("r\\a{ 1 }")[0])).toMatchObject( + { + argument: { + type: "argument", + content: [{ type: "group", content: [{ type: "string", content: "123" }] }], + openMark: "\\a", 
+ closeMark: "1", + }, + nodesRemoved: 3, + } + ); + + ast = [ + { "type": "macro", "content": "abc" }, + { "type": "string", "content": "123" }, + { "type": "macro", "content": "def" } + ]; + expect(gobbleSingleArgument(ast, parseArgspec("r\\abc\\def")[0])).toMatchObject( + { + argument: { + type: "argument", + content: [{ type: "string", content: "123" }], + openMark: "\\abc", + closeMark: "\\def", + }, + nodesRemoved: 3, + } + ); + }) it("can gobble embellishments", () => { let ast: Ast.Node[] = [{ type: "string", content: "xxx" }]; expect(gobbleSingleArgument(ast, parseArgspec("e{}")[0])).toMatchObject( diff --git a/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs b/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs index 6404ca27..977de25b 100644 --- a/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs +++ b/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs @@ -88,9 +88,9 @@ required } / "r" braceSpec:brace_spec { return createNode("mandatory", braceSpec); } -// An "until" argument gobbles tokens until the specified stop token(s) +// An "until" argument gobbles tokens until the specified stop token(s). Until token allows whitespace. until - = "u" stopTokens:(x:until_token { return [x] } / '{' @(until_token+) '}') { + = "u" stopTokens:(x:token { return [x] } / '{' @(token_or_whitespace+) '}') { return createNode("until", { stopTokens }); } @@ -107,7 +107,7 @@ brace_spec / "{}" { return { openBrace: "{", closeBrace: "}"}} braced_group - = "{" content:(macro_name / non_brace / braced_group)* "}" { + = "{" content:( token_or_whitespace / braced_group)* "}" { return content; } @@ -122,13 +122,12 @@ macro_name token = macro_name / non_brace -// Until token allows whitespace -until_token - = macro_name / ![{}] @. 
+token_or_whitespace + = token / whitespace_token -// No need to separate individual characters here +// No need to separate individual characters here, just need to trim enclosing whitespaces group - = x:braced_group { return x.map(arrayContent).join(''); } + = x:braced_group { return x.map(arrayContent).join('').trim(); } token_or_group = token / group diff --git a/packages/unified-latex-util-scan/libs/scan.ts b/packages/unified-latex-util-scan/libs/scan.ts index 0c182fca..48bc96c1 100644 --- a/packages/unified-latex-util-scan/libs/scan.ts +++ b/packages/unified-latex-util-scan/libs/scan.ts @@ -20,7 +20,7 @@ export function scan( */ endIndex?: number; /** - * If `true`, whitespace and comments will be skilled but any other + * If `true`, whitespace and comments will be skipped but any other * node that doesn't match `token` will cause the scan to terminate. */ onlySkipWhitespaceAndComments?: boolean; @@ -36,8 +36,10 @@ export function scan( if (typeof token === "string") { token = { type: "string", content: token } as Ast.String; } + const start = typeof startIndex === "number" ? startIndex : 0; + const end = typeof endIndex === "number" ? endIndex : nodes.length - 1; - for (let i = startIndex || 0; i <= (endIndex || nodes.length - 1); i++) { + for (let i = start; i <= end; i++) { const node = nodes[i]; if (node.type === token.type) { switch (node.type) {