Merge pull request #80 from soasme/x

peg: support \xXX
soasme · Jul 28, 2021 · 5a49607 · 5a49607
2 parents c6cc1f8 + c10d2fb
commit 5a49607
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 2 deletions.
diff --git a/docs/peg.rst b/docs/peg.rst
@@ -64,7 +64,13 @@ Emoji is supported:
 
     greeting = "Peppa 🐷";
 
-You can encode UTF-8 code points via `\\u` followed by 4 hex digits or `\\U` followed by 8 hex digits.
+You can encode ASCII characters via `\\x` followed by 2 hex digits.
+
+.. code-block::
+
+    greeting = "\x48\x65\x6c\x6c\x6f, world";
+
+You can encode Unicode code points via `\\u` followed by 4 hex digits or `\\U` followed by 8 hex digits.
 
 .. code-block::
 
@@ -360,6 +366,24 @@ In this example, the rule `float` will drop all `number` nodes, leaving only one
 
     number = [0-9];
 
+@scoped
+```````
+
+Ignore all the decorators set by upstream rules.
+
+For example, although `greeting2` is marked `@tight` and therefore does not apply the spaced rule `ws`, `greeting` still applies `ws` because it is under its own scope.
+
+.. code-block::
+
+    @tight
+    greeting2 = greeting greeting;
+
+    @scoped
+    greeting = "hello" "world";
+
+    @spaced
+    ws = " ";
+
 
 Use Peg API
 ------------
@@ -425,6 +449,10 @@ Cheatsheet
      - cancel effects
    * - `"literal"`
      - exact match
+   * - `"\x0d\x0a"`
+     - exact match using ASCII hex escapes
+   * - `"\u4f60\u597D"`
+     - exact match using Unicode code point escapes
    * - `i"literal"`
      - case-insensitive match
    * - `[a-z]`

diff --git a/peppapeg.c b/peppapeg.c
@@ -831,6 +831,12 @@ size_t P4_ReadEscapedRune(char* text, P4_Rune* rune) {
         case '"': *rune = 0x22; return 2;
         case '/': *rune = 0x2f; return 2;
         case '\\': *rune = 0x5c; return 2;
+        case 'x': { /* TODO: may not have enough chars. */
+            char chs[3] = {0, 0, 0};
+            memcpy(chs, text + 2, 2);
+            *rune = strtoul(chs, NULL, 16);
+            return 4;
+        }
         case 'u': { /* TODO: may not have enough chars. */
             char chs[5] = {0, 0, 0, 0, 0};
             memcpy(chs, text + 2, 4);
@@ -3218,7 +3224,7 @@ P4_Grammar* P4_CreatePegGrammar () {
         P4_CreateRange(0x5d, 0x10ffff, 1),
         P4_CreateSequenceWithMembers(2,
             P4_CreateLiteral("\\", true),
-            P4_CreateChoiceWithMembers(10,
+            P4_CreateChoiceWithMembers(11,
                 P4_CreateLiteral("\"", true),
                 P4_CreateLiteral("/", true),
                 P4_CreateLiteral("\\", true),
@@ -3227,6 +3233,16 @@ P4_Grammar* P4_CreatePegGrammar () {
                 P4_CreateLiteral("n", true),
                 P4_CreateLiteral("r", true),
                 P4_CreateLiteral("t", true),
+                P4_CreateSequenceWithMembers(2,
+                    P4_CreateLiteral("x", true),
+                    P4_CreateRepeatExact(
+                        P4_CreateChoiceWithMembers(3,
+                            P4_CreateRange('0', '9', 1),
+                            P4_CreateRange('a', 'f', 1),
+                            P4_CreateRange('A', 'F', 1)
+                        ), 2
+                    )
+                ),
                 P4_CreateSequenceWithMembers(2,
                     P4_CreateLiteral("u", true),
                     P4_CreateRepeatExact(

diff --git a/tests/test_peg.c b/tests/test_peg.c
@@ -311,7 +311,9 @@ void test_eval_literal(void) {
     ASSERT_EVAL_GRAMMAR("R1 = \"你好, World\";", "R1", "你好, World", P4_Ok, "[{\"slice\":[0,13],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = \"نامهای\";", "R1", "نامهای", P4_Ok, "[{\"slice\":[0,12],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = \"Peppa PEG 🐷\";", "R1", "Peppa PEG 🐷", P4_Ok, "[{\"slice\":[0,14],\"type\":\"R1\"}]");
+    ASSERT_EVAL_GRAMMAR("R1 = \"\\x48\\x65\\x6c\\x6c\\x6f, world\";", "R1", "Hello, world", P4_Ok, "[{\"slice\":[0,12],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = \"\\u4f60\\u597d, world\";", "R1", "你好, world", P4_Ok, "[{\"slice\":[0,13],\"type\":\"R1\"}]");
+    /* ASSERT_EVAL_GRAMMAR("R1 = \"\\xe4\\xbd\\xa0\\xe5\\xa5\\xbd, world\";", "R1", "你好, world", P4_Ok, "[{\"slice\":[0,13],\"type\":\"R1\"}]"); */ /* Not sure if this one should be the expected behavior. */
     ASSERT_EVAL_GRAMMAR("R1 = \"\\n\";", "R1", "\n", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = \"\\r\";", "R1", "\r", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = \"\\t\";", "R1", "\t", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
@@ -327,6 +329,7 @@ void test_eval_insensitive(void) {
     ASSERT_EVAL_GRAMMAR("R1 = i\"你好, World\";", "R1", "你好, WORLD", P4_Ok, "[{\"slice\":[0,13],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = i\"نامهای\";", "R1", "نامهای", P4_Ok, "[{\"slice\":[0,12],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = i\"Peppa PEG 🐷\";", "R1", "peppa peg 🐷", P4_Ok, "[{\"slice\":[0,14],\"type\":\"R1\"}]");
+    ASSERT_EVAL_GRAMMAR("R1 = i\"\\x48\\x65\\x6c\\x6c\\x6f, world\";", "R1", "HELLO, WORLD", P4_Ok, "[{\"slice\":[0,12],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = i\"\\u4f60\\u597d, world\";", "R1", "你好, WORLD", P4_Ok, "[{\"slice\":[0,13],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = i\"Hello Worìd\";", "R1", "HELLO WORÌD", P4_Ok, "[{\"slice\":[0,12],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = i\"\\n\";", "R1", "\n", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
@@ -339,6 +342,9 @@ void test_eval_range(void) {
     ASSERT_EVAL_GRAMMAR("R1 = [0-9];", "R1", "0", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = [a-z];", "R1", "a", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = [A-Z];", "R1", "A", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
+    ASSERT_EVAL_GRAMMAR("R1 = [\\x30-\\x39];", "R1", "0", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
+    ASSERT_EVAL_GRAMMAR("R1 = [\\x41-\\x5A];", "R1", "A", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
+    ASSERT_EVAL_GRAMMAR("R1 = [\\x61-\\x7A];", "R1", "a", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = [\\u0001-\\U0010ffff];", "R1", "a", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = [你-好];", "R1", "你", P4_Ok, "[{\"slice\":[0,3],\"type\":\"R1\"}]");
     ASSERT_EVAL_GRAMMAR("R1 = [1-9..2];", "R1", "1", P4_Ok, "[{\"slice\":[0,1],\"type\":\"R1\"}]");