-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
parser.py
223 lines (185 loc) · 7.29 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# -*- coding: utf-8 -*-
#
# Copyright © Spyder Project Contributors
# Licensed under the terms of the MIT License
# (see spyder/__init__.py for details)
"""
LL(1) predictive parser for snippets CFG grammar.
Aho, Sethi, Ullman
Compilers: Principles, Techniques, and Tools, Addison-Wesley, 1986
"""
# The following grammar extracts snippets from a given text, regardless of the
# programming language used.
GRAMMAR = """
START -> EXPRLIST
EXPRLIST -> EXPR MOREEXPR
MOREEXPR -> EXPR MOREEXPR | EPSILON
EXPR -> SNIPPET | ANY
TEXT_SNIPPETS -> MOREEXPR
TEXT -> ANY MOREANY
MOREANY -> ANY MOREANY | EPSILON
TEXT_NO_COL -> ANY_NO_COL MORE_ANY_NO_COL
MORE_ANY_NO_COL -> ANY_NO_COL MORE_ANY_NO_COL | EPSILON
TEXT_COL -> ANY_COL MORE_COL
MORE_COL -> ANY_COL MORE_COL | EPSILON
TEXT_REGEX -> ANY_REGEX MORE_REGEX
MORE_REGEX -> ANY_REGEX MORE_REGEX | EPSILON
TEXT_FORMAT -> ANY FOLLOW_ANY
FOLLOW_ANY -> ANY FOLLOW_ANY | EPSILON
ANY_REGEX -> ANY | dollar
ANY -> ANY_COL | ,
ANY_COL -> ANY_NO_COL | COLONS
ANY_NO_COL -> name | int | case | SYMBOLS | whitespace | left_curly_name
COLONS -> : | :+ | :- | :?
SYMBOLS -> \\: | \\$ | text_pipe | { | \\} | \\ | \\/ | \\, | symbol
SNIPPET -> dollar SNIPPETBODY
SNIPPETBODY -> SNIPPETID | INTSNIPPET | VARSNIPPET
INTSNIPPET -> int | { int INTSNIPPETBODY }
INTSNIPPETBODY -> COLONBODY | PIPEBODY | EPSILON
COLONBODY -> : TEXT_SNIPPETS
PIPEBODY -> pipe TEXTSEQ pipe
TEXTSEQ -> TEXT_COL MORETEXT
MORETEXT -> , TEXT_COL MORETEXT | EPSILON
VARSNIPPET -> left_curly_name VARSNIPPETBODY } | name
VARSNIPPETBODY -> COLONBODY | REGEXBODY | EPSILON
REGEXBODY -> / REGEX / FORMATTEXT / OPTIONS
REGEX -> TEXT_REGEX | EPSILON
OPTIONS -> TEXT_REGEX | EPSILON
FORMATTEXT -> FORMAT MOREFORMAT
MOREFORMAT -> FORMAT MOREFORMAT | EPSILON
FORMAT -> FORMATEXPR | TEXT_FORMAT
FORMATTEXT_NO_COL -> FORMAT_NO_COL MOREFORMAT_NO_COL
MOREFORMAT_NO_COL -> FORMAT_NO_COL MOREFORMAT_NO_COL | EPSILON
FORMAT_NO_COL -> FORMATEXPR | TEXT_NO_COL
FORMATEXPR -> dollar FORMATBODY
FORMATBODY -> int | { int FORMATOPTIONS }
FORMATOPTIONS -> CASEOPTS | IFOPTS | IFELSEOPTS | ELSEOPTS | EPSILON
CASEOPTS -> : FORMATTEXT
IFOPTS -> :+ FORMATTEXT
IFELSEOPTS -> :? FORMATTEXT_NO_COL : FORMATTEXT
ELSEOPTS -> PROLOGFUNCT FORMATTEXT
PROLOGFUNCT -> : | :-
"""
def _preprocess_grammar(grammar):
grammar_lines = grammar.strip().splitlines()
grammar_lines = [line.strip() for line in grammar_lines if line != '']
grammar = {}
for line in grammar_lines:
production_name, rules = line.split(' -> ')
rules = rules.split(' | ')
productions = []
for rule in rules:
rule_parts = rule.strip().split()
productions.append(rule_parts)
grammar[production_name] = productions
return grammar
def create_LL1_parsing_table(grammar=GRAMMAR, starting_rule='START'):
"""Create LL(1) parsing table for a given grammar."""
grammar = _preprocess_grammar(grammar)
fne = first_no_epsilon(grammar)
first = {}
for rule in fne:
first[rule] = list(set([i[1] for i in fne[rule]]))
follow_rules = follow(grammar, first, starting_rule)
parse_table = {}
for rule in fne:
parse_table[rule] = {}
for _, sym, production in fne[rule]:
if sym != 'EPSILON':
parse_table[rule][sym] = production
else:
for follow_sym in follow_rules[rule]:
parse_table[rule][follow_sym] = []
return grammar, fne, follow_rules, parse_table
def first_no_epsilon(grammar):
"""Compute FIRST sets for all grammar rules."""
fne = {}
for rule in grammar:
fne = first(grammar, rule, fne)
return fne
def first(grammar, rule, fne):
"""
Compute FIRST set for a given rule.
The first set of a sequence of symbols u, written as First(u) is the set
of terminals which start the sequences of symbols derivable from u.
A bit more formally, consider all strings derivable from u. If u =>* v,
where v begins with some terminal, that terminal is in First(u).
If u =>* epsilon, then epsilon is in First(u).
"""
first_set = []
if rule not in fne:
for productions in grammar[rule]:
epsilon_found = True
for production in productions:
if production not in grammar:
# Terminal symbol
first_set.append((rule, production, productions))
epsilon_found = False
break
else:
# Further productions
first_production = productions[0]
fne = first(grammar, production, fne)
num_epsilon = 0
for _, sym, _ in fne[first_production]:
if sym != 'EPSILON':
first_set.append((rule, sym, productions))
else:
num_epsilon += 1
if num_epsilon == 0:
epsilon_found = False
break
if epsilon_found:
first_set.append((rule, 'EPSILON', productions))
fne[rule] = first_set
return fne
def follow(grammar, fne, starting_rule):
"""Compute FOLLOW sets for all grammar rules."""
follow = {}
position = {}
follow[starting_rule] = ['<eof>']
for rule1 in grammar:
rule1_position = []
for rule2 in grammar:
if rule1 == rule2:
continue
for i, productions in enumerate(grammar[rule2]):
for j, production in enumerate(productions):
if production == rule1:
rule1_position.append((rule2, i, j))
position[rule1] = rule1_position
for rule in grammar:
follow = _follow(grammar, fne, rule, follow, position, starting_rule)
return follow
def _follow(grammar, fne, rule, follow, position, starting_rule):
"""
Compute FOLLOW set for a grammar rule.
The follow set of a nonterminal A is the set of terminal symbols that can
appear immediately to the right of A in a valid sentence.
A bit more formally, for every valid sentence S =>* uAv, where v begins
with some terminal, and that terminal is in Follow(A).
"""
rule_follow = []
if rule not in follow or rule == starting_rule:
if rule == starting_rule:
rule_follow = follow[rule]
for derived_rule, i, j in position[rule]:
production = grammar[derived_rule][i]
if j < len(production) - 1:
next_rule = production[j + 1]
if next_rule in grammar:
next_rule_first = [x for x in fne[next_rule]
if x != 'EPSILON']
rule_follow += next_rule_first
if 'EPSILON' in fne[next_rule]:
follow = _follow(grammar, fne, derived_rule,
follow, position, starting_rule)
rule_follow += follow[derived_rule]
else:
rule_follow.append(next_rule)
else:
follow = _follow(grammar, fne, derived_rule,
follow, position, starting_rule)
rule_follow += follow[derived_rule]
follow[rule] = list(set(rule_follow))
return follow