From e416cd8d9fb7a92816ec4d364e82fbc9da2a6ce9 Mon Sep 17 00:00:00 2001
From: theseanl <i73hi64d0wr5df8pckig@gmail.com>
Date: Sun, 21 Jan 2024 03:32:16 +0900
Subject: [PATCH] feat: support multi-stop token in 'until', macro delimiters

Fix the behavior of the 'until' argspec and, while doing so, add support
for multi-token stops. Macro delimiters are now properly supported as
well (which was really just a by-product of applying uniform treatment
to all logic related to finding braces). This fixes
https://github.com/siefkenj/unified-latex/issues/46.
---
 .../libs/gobble-single-argument.ts            | 27 ++++----
 .../tests/gobble-single-argument.test.ts      | 68 +++++++++++++++++++
 .../grammars/xparse-argspec.pegjs             | 15 ++--
 packages/unified-latex-util-scan/libs/scan.ts |  6 +-
 4 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts b/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts
index 9c1ba8ea..a1854537 100644
--- a/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts
+++ b/packages/unified-latex-util-arguments/libs/gobble-single-argument.ts
@@ -48,8 +48,8 @@ export function gobbleSingleArgument(
             }
         };
 
-    const openMark = parseBrace(argSpec.openBrace);
-    const closeMark = parseBrace(argSpec.closeBrace);
+    const openMark = parseToken(argSpec.openBrace);
+    const closeMark = parseToken(argSpec.closeBrace);
 
     // Only mandatory arguments can be wrapped in {...}.
     // Since we already parse such things as groups, we need to
@@ -148,13 +148,10 @@ export function gobbleSingleArgument(
                 break;
             }
             case "until": {
-                const stopTokens: (string | Ast.Whitespace)[] = argSpec.stopTokens.map(rawToken => {
-                    if (rawToken === " ") {
-                        return { type: "whitespace" };
-                    }
-                    return rawToken;
-                });
-
+                const stopTokens = argSpec.stopTokens.map(parseToken);
+                // TODO: in order to match xparse's behavior, multiple spaces at the start
+                // or in the middle should be collapsed to a single whitespace token,
+                // and spaces at the end should be ignored.
                 let nextStartPos = startPos;
                 let bracePos: [number, number] | undefined;
                 while (nextStartPos < nodes.length) {
@@ -189,7 +186,9 @@ export function gobbleSingleArgument(
                     openMark: "",
                     closeMark: printRaw(argSpec.stopTokens),
                 });
-                currPos = bracePos[1];
+                // Since `stopTokens` may consist of more than one token,
+                // we need to advance `currPos` further.
+                currPos = bracePos[1] + stopTokens.length - 1;
                 if (currPos < nodes.length) {
                     currPos++;
                 }
@@ -332,13 +331,11 @@ function findDelimiter(nodes: Ast.Node[], token: Braces, startPos: number, endPo
     return closeMarkPos;
 }
 
-function parseBrace(str: string | undefined): string | Ast.Macro {
+function parseToken(str: string | undefined): string | Ast.Whitespace | Ast.Macro {
     if (!str) { return ""; }
+    if (!str.trim()) { return { type: "whitespace" }; }
     if (str.startsWith("\\")) {
-        return {
-            type: "macro",
-            content: str.slice(1)
-        }
+        return { type: "macro", content: str.slice(1) };
     }
     return str;
 }
\ No newline at end of file
diff --git a/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts b/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts
index 702820fe..2c4dc9a1 100644
--- a/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts
+++ b/packages/unified-latex-util-arguments/tests/gobble-single-argument.test.ts
@@ -498,6 +498,7 @@ describe("unified-latex-util-arguments", () => {
             { type: "string", content: "]" },
             { type: "string", content: "y" },
         ];
+
         expect(
             gobbleSingleArgument([...ast], parseArgspec("!o")[0])
         ).toMatchObject({
@@ -599,6 +600,38 @@ describe("unified-latex-util-arguments", () => {
         });
         expect(nodes).toEqual([{ content: "yx", type: "string" }]);
     });
+    it("can gobble an 'until' argument with multiple stop tokens", () => {
+        let argspec = parseArgspec("u{a \\bcd}")[0];
+        value = "asdf asydfxya{x}sa \\bcd2df";
+        file = processLatexToAstViaUnified().processSync({ value });
+        let nodes = trimRenderInfo((file.result as any).content) as Ast.Node[];
+        expect(gobbleSingleArgument(nodes, argspec)).toEqual({
+            argument: {
+                type: "argument",
+                content: [
+                    // Due to the current implementation of gobbleSingleArgument,
+                    // extra string splits may be introduced during the search.
+                    { "type": "string", "content": "a" },
+                    { "type": "string", "content": "sdf" },
+                    { "type": "whitespace" },
+                    { "type": "string", "content": "a" },
+                    { "type": "string", "content": "sydfxy" },
+                    { "type": "string", "content": "a" },
+                    { "type": "group", "content": [{ "type": "string", "content": "x" }] },
+                    { "type": "string", "content": "s" },
+                ],
+                openMark: "",
+                closeMark: "a \\bcd",
+            },
+            nodesRemoved: 11,
+        });
+        expect(nodes).toEqual([
+            {
+                "type": "string",
+                "content": "2df"
+            }
+        ]);
+    });
     it("gobbleSingleArgument gobbles non-punctuation delimited arguments", () => {
         let ast: Ast.Node[] = [
             { type: "whitespace" },
@@ -692,6 +725,41 @@ describe("unified-latex-util-arguments", () => {
             }
         );
     });
+    it("gobbleSingleArgument gobbles arguments delimited by tokens", () => {
+        let ast: Ast.Node[] = [
+            { "type": "macro", "content": "a" },
+            { "type": "group", "content": [{ "type": "string", "content": "123" }] },
+            { "type": "string", "content": "1" }
+        ];
+        expect(gobbleSingleArgument(ast, parseArgspec("r\\a{ 1 }")[0])).toMatchObject(
+            {
+                argument: {
+                    type: "argument",
+                    content: [{ type: "group", content: [{ type: "string", content: "123" }] }],
+                    openMark: "\\a",
+                    closeMark: "1",
+                },
+                nodesRemoved: 3,
+            }
+        );
+
+        ast = [
+            { "type": "macro", "content": "abc" },
+            { "type": "string", "content": "123" },
+            { "type": "macro", "content": "def" }
+        ];
+        expect(gobbleSingleArgument(ast, parseArgspec("r\\abc\\def")[0])).toMatchObject(
+            {
+                argument: {
+                    type: "argument",
+                    content: [{ type: "string", content: "123" }],
+                    openMark: "\\abc",
+                    closeMark: "\\def",
+                },
+                nodesRemoved: 3,
+            }
+        );
+    })
     it("can gobble embellishments", () => {
         let ast: Ast.Node[] = [{ type: "string", content: "xxx" }];
         expect(gobbleSingleArgument(ast, parseArgspec("e{}")[0])).toMatchObject(
diff --git a/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs b/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs
index 6404ca27..977de25b 100644
--- a/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs
+++ b/packages/unified-latex-util-pegjs/grammars/xparse-argspec.pegjs
@@ -88,9 +88,9 @@ required
         }
     / "r" braceSpec:brace_spec { return createNode("mandatory", braceSpec); }
 
-// An "until" argument gobbles tokens until the specified stop token(s)
+// An "until" argument gobbles tokens until the specified stop token(s). Stop tokens given in a braced group may include whitespace.
 until
-    = "u" stopTokens:(x:until_token { return [x] } / '{' @(until_token+) '}') {
+    = "u" stopTokens:(x:token { return [x] } / '{' @(token_or_whitespace+) '}') {
             return createNode("until", { stopTokens });
         }
 
@@ -107,7 +107,7 @@ brace_spec
     / "{}" { return { openBrace: "{", closeBrace: "}"}}
 
 braced_group
-    = "{" content:(macro_name / non_brace / braced_group)* "}" {
+    = "{" content:( token_or_whitespace / braced_group)* "}" {
             return content;
         }
 
@@ -122,13 +122,12 @@ macro_name
 token
     = macro_name / non_brace
 
-// Until token allows whitespace
-until_token
-	= macro_name / ![{}] @.
+token_or_whitespace
+	= token / whitespace_token
 
-// No need to separate individual characters here
+// No need to separate individual characters here; just trim the surrounding whitespace
 group
-    = x:braced_group { return x.map(arrayContent).join(''); }
+    = x:braced_group { return x.map(arrayContent).join('').trim(); }
 
 token_or_group
     = token / group
diff --git a/packages/unified-latex-util-scan/libs/scan.ts b/packages/unified-latex-util-scan/libs/scan.ts
index 0c182fca..48bc96c1 100644
--- a/packages/unified-latex-util-scan/libs/scan.ts
+++ b/packages/unified-latex-util-scan/libs/scan.ts
@@ -20,7 +20,7 @@ export function scan(
          */
         endIndex?: number;
         /**
-         * If `true`, whitespace and comments will be skilled but any other
+         * If `true`, whitespace and comments will be skipped but any other
          * node that doesn't match `token` will cause the scan to terminate.
          */
         onlySkipWhitespaceAndComments?: boolean;
@@ -36,8 +36,10 @@ export function scan(
     if (typeof token === "string") {
         token = { type: "string", content: token } as Ast.String;
     }
+    const start = typeof startIndex === "number" ? startIndex : 0;
+    const end = typeof endIndex === "number" ? endIndex : nodes.length - 1;
 
-    for (let i = startIndex || 0; i <= (endIndex || nodes.length - 1); i++) {
+    for (let i = start; i <= end; i++) {
         const node = nodes[i];
         if (node.type === token.type) {
             switch (node.type) {