Skip to content

Commit

Permalink
Added rules to parse regular expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
rspivak committed May 1, 2011
1 parent 8fc3118 commit 7c3dcee
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 8 deletions.
106 changes: 101 additions & 5 deletions src/slimit/lexer.py
Expand Up @@ -33,6 +33,24 @@
CONNECTOR_PUNCTUATION,
)

# Token *types* after which a '/' must be read as the division operator
# rather than as the start of a regular expression literal: each of these
# ends an operand, and a regex literal cannot directly follow an operand.
TOKENS_THAT_IMPLY_DIVISON = frozenset([
    'ID',
    'NUMBER',
    'STRING',
    # The comma after 'REGEX' matters: without it Python concatenates the
    # adjacent literals into 'REGEXTRUE' and both entries are lost.
    'REGEX',
    'TRUE',
    'FALSE',
    'NULL',
    # ``this`` is an operand as well (``this / 2``).
    'THIS',
])

# Token *values* after which a '/' is treated as division: the increment and
# decrement operators and the three closing brackets.
TOKEN_VALUES_THAT_IMPLY_DIVISON = frozenset(('++', '--', ')', '}', ']'))


class Lexer(object):
"""A JavaScript lexer.
Expand All @@ -41,6 +59,10 @@ class Lexer(object):
http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
"""

def __init__(self):
    """Create a lexer with an empty token history."""
    # ``token`` shifts cur_token into prev_token on every call; until the
    # first token is read both slots are empty.
    self.prev_token = self.cur_token = None

def build(self, **kwargs):
    """Build the underlying PLY lexer and store it on ``self.lexer``.

    The rules are taken from this object; any keyword arguments are
    forwarded unchanged to ``ply.lex.lex``.
    """
    ply_lexer = ply.lex.lex(object=self, **kwargs)
    self.lexer = ply_lexer
Expand All @@ -49,9 +71,47 @@ def input(self, text):
self.lexer.input(text)

def token(self):
    """Return the next token, resolving the ``/`` ambiguity of JavaScript.

    A ``/`` can start a division operator (``/`` or ``/=``), a comment
    (``//`` or ``/*``) or a regular expression literal.  The method peeks
    at the first non-whitespace character ahead of the current position
    and picks how to read the token:

    * anything but ``/``, or a comment opener -> the ordinary lexer rules;
    * ``/`` after a token that ends an operand (see
      ``TOKENS_THAT_IMPLY_DIVISON`` / ``TOKEN_VALUES_THAT_IMPLY_DIVISON``)
      -> the ordinary rules, so it is read as division;
    * any other ``/`` -> ``self._read_regex()``.

    Side effects: ``self.prev_token`` receives the old ``self.cur_token``
    and ``self.cur_token`` receives the token being returned.

    Returns:
        Whatever the underlying lexer produced for the next token.
    """
    lexer = self.lexer
    text = lexer.lexdata
    pos = lexer.lexpos

    try:
        char = text[pos]
        while char.isspace():
            pos += 1
            char = text[pos]
        next_char = text[pos + 1]
    except IndexError:
        # Fewer than two significant characters are left, so no regex
        # literal can start here; use the ordinary rules.
        read_token = lexer.token
    else:
        previous = self.cur_token
        if char != '/' or next_char in ('/', '*'):
            # Not a slash at all, or the start of a line/block comment.
            read_token = lexer.token
        elif previous is not None and (
                previous.type in TOKENS_THAT_IMPLY_DIVISON or
                previous.value in TOKEN_VALUES_THAT_IMPLY_DIVISON):
            # The slash follows an operand: it is a division operator.
            read_token = lexer.token
        else:
            # The slash is in operand position: it opens a regex literal.
            # NOTE(review): only ``pos`` was advanced past whitespace, not
            # ``lexer.lexpos``; confirm the 'regex' state copes with a
            # newline between the previous token and the regex.
            read_token = self._read_regex

    self.prev_token = self.cur_token
    self.cur_token = read_token()
    return self.cur_token

def _read_regex(self):
    """Read one token using the rules of the 'regex' lexer state.

    The lexer is switched to the 'regex' state for exactly one ``token()``
    call and is always switched back to 'INITIAL' afterwards, even when
    that call raises (``t_regex_error`` raises ``TypeError``), so a failed
    regex never leaves the lexer stuck in the 'regex' state.

    Returns:
        The token produced by the underlying lexer in the 'regex' state.
    """
    lexer = self.lexer
    lexer.begin('regex')
    try:
        return lexer.token()
    finally:
        lexer.begin('INITIAL')

# Iterator protocol
# iterator protocol
def __iter__(self):
    """Return the lexer itself as the iterator object."""
    iterator = self
    return iterator

Expand All @@ -62,6 +122,10 @@ def next(self):

return token

# Additional lexer states: a single exclusive 'regex' state, entered by
# ``_read_regex`` while a regular expression literal is being read.
states = (('regex', 'exclusive'),)

keywords = (
'BREAK', 'CASE', 'CATCH', 'CONTINUE', 'DEBUGGER', 'DEFAULT', 'DELETE',
'DO', 'ELSE', 'FINALLY', 'FOR', 'FUNCTION', 'IF', 'IN',
Expand Down Expand Up @@ -93,7 +157,7 @@ def next(self):
'XOREQUAL', 'OREQUAL', # ^= and |=

# Terminal types
'NUMBER', 'STRING', 'ID', 'REGEXP',
'NUMBER', 'STRING', 'ID', 'REGEX',

# Comments
'LINE_COMMENT', 'BLOCK_COMMENT',
Expand All @@ -102,6 +166,38 @@ def next(self):
# 'AUTOPLUSPLUS', 'AUTOMINUSMINUS', 'IF_WITHOUT_ELSE',
) + keywords

# Rule for the REGEX token in the exclusive 'regex' state; the pattern is
# taken from jslex.  It matches one whole regular expression literal: the
# opening slash, a first character that is not '*', any number of further
# characters, the closing slash and optional trailing flags.  Escape
# sequences and bracketed character classes are consumed as units, so a '/'
# inside them does not end the literal.
# NOTE(review): the layout and '#' comments inside the pattern assume it is
# compiled in verbose mode (presumably via PLY's default flags) -- confirm.
t_regex_REGEX = r"""(?:
/ # opening slash
# First character is..
(?: [^*\\/[] # anything but * \ / or [
| \\. # or an escape sequence
| \[ # or a class, which has
( [^\]\\] # anything but \ or ]
| \\. # or an escape sequence
)* # many times
\]
)
# Following characters are same, except for excluding a star
(?: [^\\/[] # anything but \ / or [
| \\. # or an escape sequence
| \[ # or a class, which has
( [^\]\\] # anything but \ or ]
| \\. # or an escape sequence
)* # many times
\]
)* # many times
/ # closing slash
[a-zA-Z0-9]* # trailing flags
)
"""
# Ignore-set for the 'regex' state: a space and a tab (no newline).
t_regex_ignore = ' \t'
def t_regex_error(self, token):
    """Error handler for the 'regex' lexer state.

    Raises:
        TypeError: always; the message is built from ``token.value`` and
            ``token.lineno``.
    """
    details = (token.value, token.lineno)
    raise TypeError(
        "Error parsing regular expression '%s' at %s" % details)

# Punctuators
literals = (
',', '.', ';', ':',
Expand Down Expand Up @@ -181,7 +277,7 @@ def next(self):
|
(?:' # single quoted string
(?:
[^"\\\n\r] # no escape chars, line terminators or "
[^'\\\n\r] # no escape chars, line terminators or '
|
\\[a-zA-Z\\'"?] # escaped characters
|
Expand All @@ -191,7 +287,7 @@ def next(self):
)*?
')
)
"""
""" # "

# Literals
def t_NULL(self, token):
Expand Down
94 changes: 91 additions & 3 deletions src/slimit/tests/test_lexer.py
Expand Up @@ -77,12 +77,13 @@ def assertListEqual(self, first, second):
),

# Punctuators
('a /= b', ['ID a', 'DIVEQUAL /=', 'ID b']),
(('== != === !== < > <= >= || && ++ -- << >> '
'>>> += -= *= /= <<= >>= >>>= &= %= ^= |='),
'>>> += -= *= <<= >>= >>>= &= %= ^= |='),
['EQEQ ==', 'NE !=', 'STREQ ===', 'STRNEQ !==', 'LT <', 'GT >',
'LE <=', 'GE >=', 'OR ||', 'AND &&', 'PLUSPLUS ++', 'MINUSMINUS --',
'LSHIFT <<', 'RSHIFT >>', 'URSHIFT >>>', 'PLUSEQUAL +=',
'MINUSEQUAL -=', 'MULTEQUAL *=', 'DIVEQUAL /=', 'LSHIFTEQUAL <<=',
'MINUSEQUAL -=', 'MULTEQUAL *=', 'LSHIFTEQUAL <<=',
'RSHIFTEQUAL >>=', 'URSHIFTEQUAL >>>=', 'ANDEQUAL &=', 'MODEQUAL %=',
'XOREQUAL ^=', 'OREQUAL |=',
]
Expand All @@ -106,6 +107,7 @@ def assertListEqual(self, first, second):
),

# Strings
(""" '"' """, ["""STRING '"'"""]),
(r'''"foo" 'foo' "x\";" 'x\';' "foo\tbar"''',
['STRING "foo"', """STRING 'foo'""", r'STRING "x\";"',
r"STRING 'x\';'", r'STRING "foo\tbar"']
Expand All @@ -129,7 +131,93 @@ def assertListEqual(self, first, second):
['BLOCK_COMMENT /*\nCopyright LGPL 2011\n*/',
'ID a', '= =', 'NUMBER 1', '; ;']
),
]

# regex
(r'a=/a*/,1', ['ID a', '= =', 'REGEX /a*/', ', ,', 'NUMBER 1']),
(r'a=/a*[^/]+/,1',
['ID a', '= =', 'REGEX /a*[^/]+/', ', ,', 'NUMBER 1']
),
(r'a=/a*\[^/,1', ['ID a', '= =', r'REGEX /a*\[^/', ', ,', 'NUMBER 1']),
(r'a=/\//,1', ['ID a', '= =', r'REGEX /\//', ', ,', 'NUMBER 1']),

# next two are from http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
("""for (var x = a in foo && "</x>" || mot ? z:/x:3;x<5;y</g/i) {xyz(x++);}""",
["FOR for", "( (", "VAR var", "ID x", "= =", "ID a", "IN in",
"ID foo", "AND &&", 'STRING "</x>"', "OR ||", "ID mot", "? ?", "ID z",
": :", "REGEX /x:3;x<5;y</g", "/ /", "ID i", ") )", "{ {",
"ID xyz", "( (", "ID x", "PLUSPLUS ++", ") )", "; ;", "} }"]
),
("""for (var x = a in foo && "</x>" || mot ? z/x:3;x<5;y</g/i) {xyz(x++);}""",
["FOR for", "( (", "VAR var", "ID x", "= =", "ID a", "IN in",
"ID foo", "AND &&", 'STRING "</x>"', "OR ||", "ID mot", "? ?", "ID z",
"/ /", "ID x", ": :", "NUMBER 3", "; ;", "ID x", "LT <", "NUMBER 5",
"; ;", "ID y", "LT <", "REGEX /g/i", ") )", "{ {",
"ID xyz", "( (", "ID x", "PLUSPLUS ++", ") )", "; ;", "} }"]
),

# Various "illegal" regexes that are valid according to the std.
(r"""/????/, /++++/, /[----]/ """,
["REGEX /????/", ", ,", "REGEX /++++/", ", ,", "REGEX /[----]/"]
),

# Stress cases from http://stackoverflow.com/questions/5533925/what-javascript-constructs-does-jslex-incorrectly-lex/5573409#5573409
(r"""/\[/""", [r"""REGEX /\[/"""]),
(r"""/[i]/""", [r"""REGEX /[i]/"""]),
(r"""/[\]]/""", [r"""REGEX /[\]]/"""]),
(r"""/a[\]]/""", [r"""REGEX /a[\]]/"""]),
(r"""/a[\]]b/""", [r"""REGEX /a[\]]b/"""]),
(r"""/[\]/]/gi""", [r"""REGEX /[\]/]/gi"""]),
(r"""/\[[^\]]+\]/gi""", [r"""REGEX /\[[^\]]+\]/gi"""]),
("""
rexl.re = {
NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/,
UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/,
QUOTED_LITERAL: /^'(?:[^']|'')*'/,
NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/,
SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/
};
""",
["ID rexl", ". .", "ID re", "= =", "{ {",
"ID NAME", ": :",
r"""REGEX /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", ", ,",
"ID UNQUOTED_LITERAL", ": :",
r"""REGEX /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", ", ,",
"ID QUOTED_LITERAL", ": :", r"""REGEX /^'(?:[^']|'')*'/""", ", ,",
"ID NUMERIC_LITERAL", ": :",
r"""REGEX /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", ", ,",
"ID SYMBOL", ": :",
r"""REGEX /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""",
"} }", "; ;"]
),
("""
rexl.re = {
NAME: /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/,
UNQUOTED_LITERAL: /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/,
QUOTED_LITERAL: /^'(?:[^']|'')*'/,
NUMERIC_LITERAL: /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/,
SYMBOL: /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/
};
str = '"';
""",
["ID rexl", ". .", "ID re", "= =", "{ {",
"ID NAME", ": :", r"""REGEX /^(?!\d)(?:\w)+|^"(?:[^"]|"")+"/""", ", ,",
"ID UNQUOTED_LITERAL", ": :",
r"""REGEX /^@(?:(?!\d)(?:\w|\:)+|^"(?:[^"]|"")+")\[[^\]]+\]/""", ", ,",
"ID QUOTED_LITERAL", ": :", r"""REGEX /^'(?:[^']|'')*'/""", ", ,",
"ID NUMERIC_LITERAL", ": :",
r"""REGEX /^[0-9]+(?:\.[0-9]*(?:[eE][-+][0-9]+)?)?/""", ", ,",
"ID SYMBOL", ": :",
r"""REGEX /^(?:==|=|<>|<=|<|>=|>|!~~|!~|~~|~|!==|!=|!~=|!~|!|&|\||\.|\:|,|\(|\)|\[|\]|\{|\}|\?|\:|;|@|\^|\/\+|\/|\*|\+|-)/""",
"} }", "; ;",
"ID str", "= =", """STRING '"'""", "; ;",
]),
(r""" this._js = "e.str(\"" + this.value.replace(/\\/g, "\\\\").replace(/"/g, "\\\"") + "\")"; """,
["THIS this", ". .", "ID _js", "= =",
r'''STRING "e.str(\""''', "+ +", "THIS this", ". .",
"ID value", ". .", "ID replace", "( (", r"REGEX /\\/g", ", ,",
r'STRING "\\\\"', ") )", ". .", "ID replace", "( (", r'REGEX /"/g',
", ,", r'STRING "\\\""', ") )", "+ +", r'STRING "\")"', "; ;"]),
] # "


def make_test_function(input, expected):
Expand Down

0 comments on commit 7c3dcee

Please sign in to comment.