-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
AnnotatorImplementations.java
273 lines (238 loc) · 9.82 KB
/
AnnotatorImplementations.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.naturalli.NaturalLogicAnnotator;
import edu.stanford.nlp.naturalli.OpenIE;
import edu.stanford.nlp.util.MetaClass;
import edu.stanford.nlp.util.PropertiesUtils;
import java.io.IOException;
import java.util.*;
/**
* A class abstracting the implementation of various annotators.
* Importantly, subclasses of this class can overwrite the implementation
* of these annotators by returning a different annotator, and
* {@link edu.stanford.nlp.pipeline.StanfordCoreNLP} will automatically load
* the new annotator instead.
*
* @author Gabor Angeli
*/
public class AnnotatorImplementations {
/**
* Tokenize, emulating the Penn Treebank
*/
public Annotator tokenizer(Properties properties, boolean verbose, String options) {
return new TokenizerAnnotator(verbose, properties, options);
}
/**
* Clean XML input
*/
public CleanXmlAnnotator cleanXML(Properties properties,
String xmlTagsToRemove,
String sentenceEndingTags,
String dateTags,
boolean allowFlawedXml) {
return new CleanXmlAnnotator(xmlTagsToRemove, sentenceEndingTags, dateTags, allowFlawedXml);
}
/**
* Sentence split, in addition to a bunch of other things in this annotator (be careful to check the implementation!)
*/
public Annotator wordToSentences(Properties properties,
boolean verbose, String boundaryTokenRegex,
Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
Set<String> tokenRegexesToDiscard) {
return new WordsToSentencesAnnotator(verbose, boundaryTokenRegex, boundaryToDiscard, htmlElementsToDiscard,
newlineIsSentenceBreak, boundaryMultiTokenRegex, tokenRegexesToDiscard);
}
/**
* Part of speech tag
*/
public Annotator posTagger(Properties properties) {
String annotatorName = "pos";
return new POSTaggerAnnotator(annotatorName, properties);
}
/**
* Annotate lemmas
*/
public Annotator morpha(Properties properties, boolean verbose) {
return new MorphaAnnotator(verbose);
}
/**
* Annotate for named entities -- note that this combines multiple NER tag sets, and some auxiliary things (like temporal tagging)
*/
public Annotator ner(Properties properties) throws IOException {
List<String> models = new ArrayList<>();
String modelNames = properties.getProperty("ner.model");
if (modelNames == null) {
modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + "," + DefaultPaths.DEFAULT_NER_MUC_MODEL + "," + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
}
if ( ! modelNames.isEmpty()) {
models.addAll(Arrays.asList(modelNames.split(",")));
}
if (models.isEmpty()) {
// Allow for no real NER model - can just use numeric classifiers or SUTime.
// Have to unset ner.model, so unlikely that people got here by accident.
System.err.println("WARNING: no NER models specified");
}
boolean applyNumericClassifiers =
PropertiesUtils.getBool(properties,
NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY,
NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
boolean useSUTime =
PropertiesUtils.getBool(properties,
NumberSequenceClassifier.USE_SUTIME_PROPERTY,
NumberSequenceClassifier.USE_SUTIME_DEFAULT);
boolean verbose = false;
String[] loadPaths = models.toArray(new String[models.size()]);
Properties combinerProperties = PropertiesUtils.extractSelectedProperties(properties,
NERClassifierCombiner.DEFAULT_PASS_DOWN_PROPERTIES);
if (useSUTime) {
// Make sure SUTime parameters are included
Properties sutimeProps = PropertiesUtils.extractPrefixedProperties(properties, NumberSequenceClassifier.SUTIME_PROPERTY + ".");
PropertiesUtils.overWriteProperties(combinerProperties, sutimeProps);
}
NERClassifierCombiner nerCombiner = new NERClassifierCombiner(applyNumericClassifiers,
useSUTime, combinerProperties, loadPaths);
int nThreads = PropertiesUtils.getInt(properties, "ner.nthreads", PropertiesUtils.getInt(properties, "nthreads", 1));
long maxTime = PropertiesUtils.getLong(properties, "ner.maxtime", 0);
int maxSentenceLength = PropertiesUtils.getInt(properties, "ner.maxlength", Integer.MAX_VALUE);
return new NERCombinerAnnotator(nerCombiner, verbose, nThreads, maxTime, maxSentenceLength);
}
/**
* Run RegexNER -- rule-based NER based on a deterministic mapping file
*/
public Annotator tokensRegexNER(Properties properties, String name) {
return new TokensRegexNERAnnotator(name, properties);
}
/**
* Annotate mentions
*/
public Annotator mentions(Properties properties, String name) {
return new EntityMentionsAnnotator(name, properties);
}
/**
* Annotate for gender of tokens
*/
public Annotator gender(Properties properties, boolean verbose) {
return new GenderAnnotator(false, properties.getProperty("gender.firstnames", DefaultPaths.DEFAULT_GENDER_FIRST_NAMES));
}
/**
* Annotate parse trees
*
* @param properties Properties that control the behavior of the parser. It use "parse.x" properties.
* @return A ParserAnnotator
*/
public Annotator parse(Properties properties) {
String parserType = properties.getProperty("parse.type", "stanford");
String maxLenStr = properties.getProperty("parse.maxlen");
if (parserType.equalsIgnoreCase("stanford")) {
return new ParserAnnotator("parse", properties);
} else if (parserType.equalsIgnoreCase("charniak")) {
String model = properties.getProperty("parse.model");
String parserExecutable = properties.getProperty("parse.executable");
if (model == null || parserExecutable == null) {
throw new RuntimeException("Both parse.model and parse.executable properties must be specified if parse.type=charniak");
}
int maxLen = 399;
if (maxLenStr != null) {
maxLen = Integer.parseInt(maxLenStr);
}
return new CharniakParserAnnotator(model, parserExecutable, false, maxLen);
} else {
throw new RuntimeException("Unknown parser type: " + parserType + " (currently supported: stanford and charniak)");
}
}
public Annotator custom(Properties properties, String property) {
String customName = property.substring(StanfordCoreNLP
.CUSTOM_ANNOTATOR_PREFIX.length());
String customClassName = properties.getProperty(property);
try {
// name + properties
return new MetaClass(customClassName).createInstance(customName, properties);
} catch (MetaClass.ConstructorNotFoundException e) {
try {
// name
return new MetaClass(customClassName).createInstance(customName);
} catch (MetaClass.ConstructorNotFoundException e2) {
// properties
try {
return new MetaClass(customClassName).createInstance(properties);
} catch (MetaClass.ConstructorNotFoundException e3) {
// empty arguments
return new MetaClass(customClassName).createInstance();
}
}
}
}
/**
* Infer the original casing of tokens
*/
public Annotator trueCase(Properties properties, String modelLoc,
String classBias,
String mixedCaseFileName,
boolean verbose) {
return new TrueCaseAnnotator(modelLoc, classBias, mixedCaseFileName, verbose);
}
/**
* Annotate for coreference (statistical)
*/
public Annotator coref(Properties properties) {
return new HybridCorefAnnotator(properties);
}
/**
* Annotate for coreference (deterministic)
*/
public Annotator dcoref(Properties properties) {
return new DeterministicCorefAnnotator(properties);
}
/**
* Annotate for relations expressed in sentences
*/
public Annotator relations(Properties properties) {
return new RelationExtractorAnnotator(properties);
}
/**
* Annotate for sentiment in sentences
*/
public Annotator sentiment(Properties properties, String name) {
return new SentimentAnnotator(name, properties);
}
/**
* Annotate dependency relations in sentences
*/
public Annotator dependencies(Properties properties) {
Properties relevantProperties = PropertiesUtils.extractPrefixedProperties(properties,
Annotator.STANFORD_DEPENDENCIES + '.');
return new DependencyParseAnnotator(relevantProperties);
}
/**
* Annotate operators (e.g., quantifiers) and polarity of tokens in a sentence
*/
public Annotator natlog(Properties properties) {
Properties relevantProperties = PropertiesUtils.extractPrefixedProperties(properties,
Annotator.STANFORD_NATLOG + '.');
return new NaturalLogicAnnotator(relevantProperties);
}
/**
* Annotate {@link edu.stanford.nlp.ie.util.RelationTriple}s from text.
*/
public Annotator openie(Properties properties) {
Properties relevantProperties = PropertiesUtils.extractPrefixedProperties(properties,
Annotator.STANFORD_OPENIE + '.');
return new OpenIE(relevantProperties);
}
/**
* Annotate quotes and extract them like sentences
*/
public Annotator quote(Properties properties) {
Properties relevantProperties = PropertiesUtils.extractPrefixedProperties(properties,
Annotator.STANFORD_QUOTE + '.');
return new QuoteAnnotator(relevantProperties);
}
/**
* Add universal dependencies features
*/
public Annotator udfeats(Properties properties) {
return new UDFeatureAnnotator();
}
}