/
AntlrNameDictionary.java
219 lines (176 loc) · 7.06 KB
/
AntlrNameDictionary.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.ast.impl.antlr4;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.antlr.v4.runtime.ParserRuleContext;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.Vocabulary;
import org.apache.commons.lang3.StringUtils;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
/**
 * Stores the XPath names of Antlr terminals and rules. There is no simple
 * way to give names to punctuation (a lexer rule could be added, but it may
 * conflict with other tokens), so those names are hardcoded here.
 *
 * <p>Terminal names start with {@code "T-"} in XPath to avoid conflicts
 * with rule names.
 */
public class AntlrNameDictionary {

    /**
     * Shape of a name usable as-is in XPath: a letter followed by at least
     * one word character, underscore or dash. Compiled once because it is
     * evaluated for every token type in the constructor.
     */
    private static final Pattern XPATH_NAME = Pattern.compile("[a-zA-Z][\\w_-]+");

    /** XPath name of each terminal, indexed by token type. */
    private final String[] terminalXPathNames;
    /** Constant image of each terminal (null if it has none), indexed by token type. */
    private final String[] terminalImages;
    /** XPath name of each rule, indexed by rule index. */
    private final String[] nonTermXpathNames;
    private final Vocabulary vocabulary;

    /**
     * Builds the name tables for the given vocabulary and rule names.
     *
     * <p>Rule names are capitalized. Terminal names are the symbolic name
     * of the token if it has one, otherwise its literal image (or the result
     * of {@link #nonAlphaNumName(String)} if the image is not a valid name),
     * prefixed with {@code "T-"}.
     *
     * @param vocab     Vocabulary from which token names and literals are read
     * @param ruleNames Names of the rules, indexed by rule index
     */
    public AntlrNameDictionary(Vocabulary vocab, String[] ruleNames) {
        this.vocabulary = vocab;

        nonTermXpathNames = new String[ruleNames.length];
        for (int i = 0; i < nonTermXpathNames.length; i++) {
            nonTermXpathNames[i] = StringUtils.capitalize(ruleNames[i]);
        }

        // only used by the duplicate-name assertion below
        Set<String> seen = new HashSet<>();
        Collections.addAll(seen, ruleNames);

        // Vocabulary#getMaxTokenType is the highest valid token type (inclusive),
        // so the tables need one more slot than that value. The max() guarantees
        // that slot 0 (Token.INVALID_TYPE) exists even for an empty vocabulary.
        int tableSize = Math.max(vocab.getMaxTokenType(), Token.INVALID_TYPE) + 1;

        // terminal names
        terminalXPathNames = new String[tableSize];
        terminalXPathNames[0] = "Invalid"; // See Token.INVALID_TYPE

        terminalImages = new String[tableSize];
        terminalImages[0] = null;

        for (int i = Token.MIN_USER_TOKEN_TYPE; i < terminalXPathNames.length; i++) {
            String name = vocab.getSymbolicName(i);
            String literalName = vocab.getLiteralName(i);

            if (literalName != null) {
                // cleanup literal name, Antlr surrounds the image with single quotes
                literalName = literalName.substring(1, literalName.length() - 1);
                terminalImages[i] = literalName;
            }

            if (name == null && literalName != null) {
                name = literalName;

                if (!XPATH_NAME.matcher(name).matches()) { // not alphanum
                    name = nonAlphaNumName(name);
                } // otherwise something like "final"
            }

            assert name != null : "Token of kind " + i + " has no XPath name (literal " + vocab.getLiteralName(i) + ")";

            String finalName = "T-" + name;

            assert XPATH_NAME.matcher(finalName).matches() : "Not a valid XPath name " + finalName;
            assert seen.add(finalName) : "Duplicate XPath name " + finalName;

            terminalXPathNames[i] = finalName;
        }

        assert Stream.of(terminalXPathNames).distinct().count() == terminalXPathNames.length
            : "Duplicate names in " + Arrays.toString(terminalXPathNames);
    }

    /**
     * Returns the vocabulary this dictionary was built from.
     */
    public Vocabulary getVocabulary() {
        return vocabulary;
    }

    /**
     * Override this to customize the XPath name of tokens with no symbolic
     * name and with an image that is non-alphanumeric. Return null to give
     * up. The default just gives some name to common punctuation. Remember
     * that the same token may mean several things in different contexts, so
     * e.g. using {@code "not"} as the name of {@code "!"} is too specific.
     *
     * <p>This method is invoked from the constructor, so an override must
     * not depend on state initialized by the subclass constructor.
     *
     * @param name Image of the token, without the surrounding quotes
     *
     * @return A name for the token (without the {@code "T-"} prefix), or
     *         null if none is known
     */
    protected @Nullable String nonAlphaNumName(String name) {
        switch (name) {
        case "!": return "bang";
        case "!!": return "double-bang";

        case "?": return "question";
        case "??": return "double-question";
        case "?:": return "elvis";
        case "?.": return "question-dot";

        case ":": return "colon";
        case ";": return "semi";
        case ",": return "comma";

        case "(": return "lparen";
        case ")": return "rparen";
        case "[": return "lbracket";
        case "]": return "rbracket";
        case "{": return "lbrace";
        case "}": return "rbrace";

        case "_": return "underscore";

        case ".": return "dot";
        case "..": return "double-dot";
        case "...": return "ellipsis";

        case "@": return "at-symbol";
        case "$": return "dollar";

        case "\\": return "backslash";
        case "/": return "slash";
        case "//": return "double-slash";
        case "`": return "backtick";
        case "'": return "squote";
        case "\"": return "dquote";
        case "\"\"\"": return "triple-quote";

        case ">": return "gt";
        case ">=": return "ge";
        case "<": return "lt";
        case "<=": return "le";

        case ">>": return "double-gt";
        case "<<": return "double-lt";
        case ">>>": return "triple-gt";
        case "<<<": return "triple-lt";

        case "=": return "eq";
        case "==": return "double-eq";
        case "===": return "triple-eq";
        case "!=": return "not-eq";

        case "&": return "amp";
        case "&&": return "double-amp";
        case "|": return "pipe";
        case "||": return "double-pipe";

        case "*": return "star";
        case "**": return "double-star";

        case "+": return "plus";
        case "++": return "double-plus";
        case "-": return "minus";
        case "--": return "double-minus";

        case "->": return "rarrow";
        case "<-": return "larrow";

        default:
            return null;
        }
    }

    /**
     * Gets the xpath name of a terminal node with a given {@link Token#getType()}.
     * {@link Token#EOF} is not stored in the name table and is mapped to
     * {@code "EOF"}.
     *
     * @param tokenType Type of the token
     *
     * @return The XPath name of the token
     *
     * @throws IllegalArgumentException If the token type is neither in the
     *                                  name table nor {@link Token#EOF}
     */
    public @NonNull String getXPathNameOfToken(int tokenType) {
        if (tokenType >= 0 && tokenType < terminalXPathNames.length) {
            return terminalXPathNames[tokenType];
        }

        if (tokenType == Token.EOF) {
            return "EOF";
        }

        throw new IllegalArgumentException("I don't know token type " + tokenType);
    }

    /**
     * Returns the constant image of the given token (a shared string),
     * or null if the token has none. This is a memory optimization to
     * avoid creating a new string for tokens with constant images. Antlr
     * does not do this by itself sadly.
     *
     * <p>For a token whose type is outside the table, the empty string is
     * returned if its start and stop indices are equal, otherwise null.
     *
     * @param token Token to look up
     *
     * @return The shared image, or null if the image is not constant
     */
    public @Nullable String getConstantImageOfToken(Token token) {
        int tokenType = token.getType();
        if (tokenType >= 0 && tokenType < terminalImages.length) {
            return terminalImages[tokenType];
        } else if (token.getStartIndex() == token.getStopIndex()) {
            // NOTE(review): presumably meant to detect an empty token (e.g. EOF);
            // confirm against Antlr's convention for the stop index.
            return "";
        }
        return null;
    }

    /**
     * Gets the xpath name of an inner node with a given {@link ParserRuleContext#getRuleIndex()}.
     *
     * @param idx Index of the rule
     *
     * @return The capitalized rule name
     *
     * @throws IndexOutOfBoundsException If the index is invalid
     */
    public @NonNull String getXPathNameOfRule(int idx) {
        return nonTermXpathNames[idx];
    }

    /**
     * Returns the number of rule names in this dictionary. Note that this
     * is one more than the highest index accepted by
     * {@link #getXPathNameOfRule(int)}.
     */
    public int getMaxRuleIndex() {
        return nonTermXpathNames.length;
    }

    /**
     * Returns the value of {@link Vocabulary#getMaxTokenType()} for the
     * underlying vocabulary.
     */
    public int getMaxTokenType() {
        return vocabulary.getMaxTokenType();
    }
}