Added rules to parse regular expressions

rspivak · May 1, 2011 · 7c3dcee · 7c3dcee
1 parent 8fc3118
commit 7c3dcee
Show file tree

Hide file tree

Showing 2 changed files with 192 additions and 8 deletions.
diff --git a/src/slimit/lexer.py b/src/slimit/lexer.py
@@ -33,6 +33,24 @@
     CONNECTOR_PUNCTUATION,
     )
 
+TOKENS_THAT_IMPLY_DIVISON = frozenset([  # types after which '/' divides
+    'ID',
+    'NUMBER',
+    'STRING',
+    'REGEX',
+    'TRUE',
+    'FALSE',
+    'NULL',
+    ])
+
+TOKEN_VALUES_THAT_IMPLY_DIVISON = frozenset([
+    '++',
+    '--',
+    ')',
+    '}',
+    ']',
+    ])
+
 
 class Lexer(object):
     """A JavaScript lexer.
@@ -41,6 +59,10 @@ class Lexer(object):
     http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
     """
 
+    def __init__(self):
+        self.prev_token = None
+        self.cur_token = None
+
     def build(self, **kwargs):
         """Build the lexer."""
         self.lexer = ply.lex.lex(object=self, **kwargs)
@@ -49,9 +71,47 @@ def input(self, text):
         self.lexer.input(text)
 
     def token(self):
-        return self.lexer.token()
+        lexer = self.lexer
+        pos = lexer.lexpos
+        try:
+            char = lexer.lexdata[pos]
+            while char.isspace():
+                pos += 1
+                char = lexer.lexdata[pos]
+            next_char = lexer.lexdata[pos + 1]
+        except IndexError:
+            self.prev_token = self.cur_token
+            self.cur_token = lexer.token()
+            return self.cur_token
+
+        if char != '/' or (char == '/' and next_char in ('/', '*')):
+            self.prev_token = self.cur_token
+            self.cur_token = lexer.token()
+            return self.cur_token
+
+        # current character is / which is either division or regex
+        cur_token = self.cur_token
+        is_division_allowed = (
+            cur_token is not None and
+            (cur_token.type in TOKENS_THAT_IMPLY_DIVISON or
+             cur_token.value in TOKEN_VALUES_THAT_IMPLY_DIVISON)
+            )
+        if is_division_allowed:
+            self.prev_token = self.cur_token
+            self.cur_token = lexer.token()
+            return self.cur_token
+        else:
+            self.prev_token = self.cur_token
+            self.cur_token = self._read_regex()
+            return self.cur_token
+
+    def _read_regex(self):
+        self.lexer.begin('regex')
+        token = self.lexer.token()
+        self.lexer.begin('INITIAL')
+        return token
 
-    # Iterator protocol
+    # iterator protocol
     def __iter__(self):
         return self
 
@@ -62,6 +122,10 @@ def next(self):
 
         return token
 
+    states = (
+        ('regex', 'exclusive'),
+        )
+
     keywords = (
         'BREAK', 'CASE', 'CATCH', 'CONTINUE', 'DEBUGGER', 'DEFAULT', 'DELETE',
         'DO', 'ELSE', 'FINALLY', 'FOR', 'FUNCTION', 'IF', 'IN',
@@ -93,7 +157,7 @@ def next(self):
         'XOREQUAL', 'OREQUAL',            # ^= and |=
 
         # Terminal types
-        'NUMBER', 'STRING', 'ID', 'REGEXP',
+        'NUMBER', 'STRING', 'ID', 'REGEX',
 
         # Comments
         'LINE_COMMENT', 'BLOCK_COMMENT',
@@ -102,6 +166,38 @@ def next(self):
         # 'AUTOPLUSPLUS', 'AUTOMINUSMINUS', 'IF_WITHOUT_ELSE',
         ) + keywords
 
+    # taken from jslex
+    t_regex_REGEX = r"""(?:
+        /                       # opening slash
+        # First character is..
+        (?: [^*\\/[]            # anything but * \ / or [
+        |   \\.                 # or an escape sequence
+        |   \[                  # or a class, which has
+                (   [^\]\\]     #   anything but \ or ]
+                |   \\.         #   or an escape sequence
+                )*              #   many times
+            \]
+        )
+        # Following characters are same, except for excluding a star
+        (?: [^\\/[]             # anything but \ / or [
+        |   \\.                 # or an escape sequence
+        |   \[                  # or a class, which has
+                (   [^\]\\]     #   anything but \ or ]
+                |   \\.         #   or an escape sequence
+                )*              #   many times
+            \]
+        )*                      # many times
+        /                       # closing slash
+        [a-zA-Z0-9]*            # trailing flags
+        )
+        """
+    t_regex_ignore = ' \t'
+    def t_regex_error(self, token):
+        raise TypeError(
+            "Error parsing regular expression '%s' at %s" % (
+                token.value, token.lineno)
+            )
+
     # Punctuators
     literals = (
         ',', '.', ';', ':',
@@ -181,7 +277,7 @@ def next(self):
         |
         (?:' # single quoted string
             (?:
-                [^"\\\n\r]             # no escape chars, line terminators or "
+                [^'\\\n\r]             # no escape chars, line terminators or '
                 |
                 \\[a-zA-Z\\'"?]        # escaped characters
                 |
@@ -191,7 +287,7 @@ def next(self):
             )*?
         ')
     )
-    """
+    """ # "
 
     # Literals
     def t_NULL(self, token):

diff --git a/src/slimit/tests/test_lexer.py b/src/slimit/tests/test_lexer.py
@@ -77,12 +77,13 @@ def assertListEqual(self, first, second):
          ),
 
         # Punctuators
+        ('a /= b', ['ID a', 'DIVEQUAL /=', 'ID b']),
         (('== != === !== < > <= >= || && ++ -- << >> '
-          '>>> += -= *= /= <<= >>= >>>= &= %= ^= |='),
+          '>>> += -= *= <<= >>= >>>= &= %= ^= |='),
          ['EQEQ ==', 'NE !=', 'STREQ ===', 'STRNEQ !==', 'LT <', 'GT >',
           'LE <=', 'GE >=', 'OR ||', 'AND &&', 'PLUSPLUS ++', 'MINUSMINUS --',
           'LSHIFT <<', 'RSHIFT >>', 'URSHIFT >>>', 'PLUSEQUAL +=',
-          'MINUSEQUAL -=', 'MULTEQUAL *=', 'DIVEQUAL /=', 'LSHIFTEQUAL <<=',
+          'MINUSEQUAL -=', 'MULTEQUAL *=', 'LSHIFTEQUAL <<=',
           'RSHIFTEQUAL >>=', 'URSHIFTEQUAL >>>=', 'ANDEQUAL &=', 'MODEQUAL %=',
           'XOREQUAL ^=', 'OREQUAL |=',
           ]
@@ -106,6 +107,7 @@ def assertListEqual(self, first, second):
          ),
 
         # Strings
+        (""" '"' """, ["""STRING '"'"""]),
         (r'''"foo" 'foo' "x\";" 'x\';' "foo\tbar"''',
          ['STRING "foo"', """STRING 'foo'""", r'STRING "x\";"',
           r"STRING 'x\';'", r'STRING "foo\tbar"']
@@ -129,7 +131,93 @@ def assertListEqual(self, first, second):
          ['BLOCK_COMMENT /*\nCopyright LGPL 2011\n*/',
           'ID a', '= =', 'NUMBER 1', '; ;']
          ),
-        ]
+
+        # regex
+        (r'a=/a*/,1', ['ID a', '= =', 'REGEX /a*/', ', ,', 'NUMBER 1']),
+        (r'a=/a*[^/]+/,1',
+         ['ID a', '= =', 'REGEX /a*[^/]+/', ', ,', 'NUMBER 1']
+         ),
+        (r'a=/a*\[^/,1', ['ID a', '= =', r'REGEX /a*\[^/', ', ,', 'NUMBER 1']),
+        (r'a=/\//,1', ['ID a', '= =', r'REGEX /\//', ', ,', 'NUMBER 1']),
+
+        # next two are from http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
+        ("""for (var x = a in foo && "</x>" || mot ? z:/x:3;x<5;y</g/i) {xyz(x++);}""",
+         ["FOR for", "( (", "VAR var", "ID x", "= =", "ID a", "IN in",
+          "ID foo", "AND &&", 'STRING "</x>"', "OR ||", "ID mot", "? ?", "ID z",
+          ": :", "REGEX /x:3;x<5;y</g", "/ /", "ID i", ") )", "{ {",
+          "ID xyz", "( (", "ID x", "PLUSPLUS ++", ") )", "; ;", "} }"]
+         ),
+        ("""for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y</g/i) {xyz(x++);}""",
+         ["FOR for", "( (", "VAR var", "ID x", "= =", "ID a", "IN in",
+          "ID foo", "AND &&", 'STRING "</x>"', "OR ||", "ID mot", "? ?", "ID z",
+          "/ /", "ID x", ": :", "NUMBER 3", "; ;", "ID x", "LT <", "NUMBER 5",
+          "; ;", "ID y", "LT <", "REGEX /g/i", ") )", "{ {",
+          "ID xyz", "( (", "ID x", "PLUSPLUS ++", ") )", "; ;", "} }"]
+         ),
+
+        # Various "illegal" regexes that are valid according to the std.
+        (r"""/????/, /++++/, /[----]/ """,
+         ["REGEX /????/", ", ,", "REGEX /++++/", ", ,", "REGEX /[----]/"]
+         ),
+
+        # Stress cases from http://stackoverflow.com/questions/5533925/what-javascript-constructs-does-jslex-incorrectly-lex/5573409#5573409
+        (r"""/\[/""", [r"""REGEX /\[/"""]),
+        (r"""/[i]/""", [r"""REGEX /[i]/"""]),
+        (r"""/[\]]/""", [r"""REGEX /[\]]/"""]),
+        (r"""/a[\]]/""", [r"""REGEX /a[\]]/"""]),
+        (r"""/a[\]]b/""", [r"""REGEX /a[\]]b/"""]),
+        (r"""/[\]/]/gi""", [r"""REGEX /[\]/]/gi"""]),
+        (r"""/\[[^\]]+\]/gi""", [r"""REGEX /\[[^\]]+\]/gi"""]),
+        ("""
+            rexl.re = {
+            NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/,
+            UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/,
+            QUOTED_LITERAL: /^'(?:[^']|'')*'/,
+            NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/,
+            SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/
+            };
+            """,
+         ["ID rexl", ". .", "ID re", "= =", "{ {",
+          "ID NAME", ": :",
+          r"""REGEX /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", ", ,",
+          "ID UNQUOTED_LITERAL", ": :",
+          r"""REGEX /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", ", ,",
+         "ID QUOTED_LITERAL", ": :", r"""REGEX /^'(?:[^']|'')*'/""", ", ,",
+         "ID NUMERIC_LITERAL", ": :",
+         r"""REGEX /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", ", ,",
+         "ID SYMBOL", ": :",
+         r"""REGEX /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""",
+         "} }", "; ;"]
+          ),
+        ("""
+            rexl.re = {
+            NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/,
+            UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/,
+            QUOTED_LITERAL: /^'(?:[^']|'')*'/,
+            NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/,
+            SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/
+            };
+            str = '"';
+        """,
+        ["ID rexl", ". .", "ID re", "= =", "{ {",
+         "ID NAME", ": :", r"""REGEX /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", ", ,",
+         "ID UNQUOTED_LITERAL", ": :",
+         r"""REGEX /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", ", ,",
+         "ID QUOTED_LITERAL", ": :", r"""REGEX /^'(?:[^']|'')*'/""", ", ,",
+         "ID NUMERIC_LITERAL", ": :",
+         r"""REGEX /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", ", ,",
+         "ID SYMBOL", ": :",
+         r"""REGEX /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""",
+         "} }", "; ;",
+         "ID str", "= =", """STRING '"'""", "; ;",
+         ]),
+        (r""" this._js = "e.str(\"" + this.value.replace(/\\/g, "\\\\").replace(/"/g, "\\\"") + "\")"; """,
+         ["THIS this", ". .", "ID _js", "= =",
+          r'''STRING "e.str(\""''', "+ +", "THIS this", ". .",
+          "ID value", ". .", "ID replace", "( (", r"REGEX /\\/g", ", ,",
+          r'STRING "\\\\"', ") )", ". .", "ID replace", "( (", r'REGEX /"/g',
+          ", ,", r'STRING "\\\""', ") )", "+ +", r'STRING "\")"', "; ;"]),
+        ] # "
 
 
 def make_test_function(input, expected):