/
CorefDocMaker.java
252 lines (215 loc) · 9.31 KB
/
CorefDocMaker.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
package edu.stanford.nlp.coref;
import java.util.ArrayList;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.InputDoc;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader;
import edu.stanford.nlp.coref.docreader.DocReader;
import edu.stanford.nlp.coref.md.CorefMentionFinder;
import edu.stanford.nlp.coref.md.DependencyCorefMentionFinder;
import edu.stanford.nlp.coref.md.HybridCorefMentionFinder;
import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.TreeLemmatizer;
import edu.stanford.nlp.trees.international.pennchinese.ChineseSemanticHeadFinder;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.logging.Redwood;
/**
 * Builds a coref {@code Document} from an {@code Annotation} plus optional extra info.
 * Input reading (raw text, CoNLL, etc., via a {@code DocReader}), mention detection,
 * and document preprocessing are all performed here.
 *
 * @author heeyoung
 */
public class CorefDocMaker {

  /** Configuration properties for the coref system. */
  Properties props;
  /** Reads input documents (CoNLL, raw text, etc.); may be null for unimplemented input types. */
  DocReader reader;
  /** Head finder for the configured language (English or Chinese). */
  final HeadFinder headFinder;
  /** Mention detector chosen by {@code CorefProperties.getMDType}. */
  CorefMentionFinder md;
  /** Coref dictionaries shared across documents. */
  Dictionaries dict;
  /** Pipeline used to fill in missing annotations; only set when coref.addMissingAnnotations=true. */
  StanfordCoreNLP corenlp;
  final TreeLemmatizer treeLemmatizer;
  /** Optional singleton-mention classifier; null when the predictor is disabled. */
  LogisticClassifier<String, String> singletonPredictor;
  /** Whether this maker runs a pipeline pass to add missing annotations. */
  boolean addMissingAnnotations;

  /**
   * Creates a document maker configured from the given properties.
   *
   * @param props coref configuration properties
   * @param dictionaries shared coref dictionaries
   * @throws ClassNotFoundException if a serialized model referenced by the mention finder cannot be loaded
   * @throws IOException if a model or resource file cannot be read
   */
  public CorefDocMaker(Properties props, Dictionaries dictionaries) throws ClassNotFoundException, IOException {
    this.props = props;
    this.dict = dictionaries;
    reader = getDocumentReader(props);
    headFinder = getHeadFinder(props);
    md = getMentionFinder(props, dictionaries, headFinder);
    // the property coref.addMissingAnnotations must be set to true to get the CorefDocMaker to add annotations
    if (CorefProperties.addMissingAnnotations(props)) {
      addMissingAnnotations = true;
      corenlp = loadStanfordProcessor(props);
    } else {
      addMissingAnnotations = false;
    }
    treeLemmatizer = new TreeLemmatizer();
    singletonPredictor = (CorefProperties.useSingletonPredictor(props)) ?
        getSingletonPredictorFromSerializedFile(CorefProperties.getPathSingletonPredictor(props)) : null;
  }

  /**
   * Load Stanford Processor: skip unnecessary annotators.
   * Builds an annotator list from the gold-annotation flags (gold POS/NER/parse are
   * skipped when available) and overrides whatever "annotators" the caller specified.
   */
  protected StanfordCoreNLP loadStanfordProcessor(Properties props) {
    // new Properties(props) layers the pipeline settings over the caller's properties as defaults
    Properties pipelineProps = new Properties(props);
    StringBuilder annoSb = new StringBuilder("");
    if (!CorefProperties.useGoldPOS(props)) {
      annoSb.append("pos, lemma");
    } else {
      annoSb.append("lemma");
    }
    if (CorefProperties.USE_TRUECASE) {
      annoSb.append(", truecase");
    }
    // Chinese always re-runs NER even with gold NE annotations
    if (!CorefProperties.useGoldNE(props) || CorefProperties.getLanguage(props) == Locale.CHINESE) {
      annoSb.append(", ner");
    }
    if (!CorefProperties.useGoldParse(props)) {
      if (CorefProperties.useConstituencyTree(props)) annoSb.append(", parse");
      else annoSb.append(", depparse");
    }
    // need to add mentions
    annoSb.append(", mention");
    String annoStr = annoSb.toString();
    Redwood.log("MentionExtractor ignores specified annotators, using annotators=" + annoStr);
    pipelineProps.put("annotators", annoStr);
    return new StanfordCoreNLP(pipelineProps, false);
  }

  /**
   * Returns the document reader for the configured input type.
   * Only CoNLL input is implemented; other types return null (TODO).
   */
  private static DocReader getDocumentReader(Properties props) {
    switch (CorefProperties.getInputType(props)) {
      case CONLL:
        String corpusPath = CorefProperties.getPathInput(props);
        CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options();
        options.annotateTokenCoref = false;
        if (CorefProperties.useCoNLLAuto(props)) options.setFilter(".*_auto_conll$");
        options.lang = CorefProperties.getLanguage(props);
        return new CoNLLDocumentReader(corpusPath, options);
      case ACE:
        // TODO
        return null;
      case MUC:
        // TODO
        return null;
      case RAW:
      default: // default is raw text
        // TODO
        return null;
    }
  }

  /**
   * Returns the semantic head finder for the configured language.
   *
   * @throws RuntimeException if the language is neither English nor Chinese
   */
  private static HeadFinder getHeadFinder(Properties props) {
    Locale lang = CorefProperties.getLanguage(props);
    if (lang == Locale.ENGLISH) return new SemanticHeadFinder();
    else if (lang == Locale.CHINESE) return new ChineseSemanticHeadFinder();
    else {
      throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
    }
  }

  /** Returns the mention finder selected by the coref.md.type property (default: dependency). */
  private static CorefMentionFinder getMentionFinder(Properties props, Dictionaries dictionaries, HeadFinder headFinder) throws ClassNotFoundException, IOException {
    switch (CorefProperties.getMDType(props)) {
      case RULE:
        return new RuleBasedCorefMentionFinder(headFinder, props);
      case HYBRID:
        return new HybridCorefMentionFinder(headFinder, props);
      case DEPENDENCY:
      default: // default is dependency
        return new DependencyCorefMentionFinder(props);
    }
  }

  /** Convenience overload: wraps the annotation in an InputDoc with no gold info. */
  public Document makeDocument(Annotation anno) throws Exception {
    return makeDocument(new InputDoc(anno, null, null));
  }

  /**
   * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)).
   * Mention detection and document preprocessing is done here.
   *
   * @param input the input document; may be null, in which case null is returned
   * @throws Exception propagated from annotation or preprocessing
   */
  public Document makeDocument(InputDoc input) throws Exception {
    if (input == null) return null;
    Annotation anno = input.annotation;
    if (Boolean.parseBoolean(props.getProperty("coref.useMarkedDiscourse", "false"))) {
      anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }
    // add missing annotation
    if (addMissingAnnotations) {
      addMissingAnnotation(anno);
    }
    // each sentence should have a CorefCoreAnnotations.CorefMentionsAnnotation.class which maps to List<Mention>
    // this is set by the mentions annotator
    List<List<Mention>> mentions = new ArrayList<>();
    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      mentions.add(sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class));
    }
    Document doc = new Document(input, mentions);
    // find headword for gold mentions
    if (input.goldMentions != null) findGoldMentionHeads(doc);
    // document preprocessing: initialization (assign ID), mention processing (gender, number, type, etc), speaker extraction, etc
    Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder);
    return doc;
  }

  /** Assigns dependency-based head words to each sentence's gold mentions, in parallel with goldMentions indexing. */
  private void findGoldMentionHeads(Document doc) {
    List<CoreMap> sentences = doc.annotation.get(SentencesAnnotation.class);
    for (int i = 0; i < sentences.size(); i++) {
      DependencyCorefMentionFinder.findHeadInDependency(sentences.get(i), doc.goldMentions.get(i));
    }
  }

  /**
   * Runs the configured pipeline over the annotation to fill in missing layers.
   * Constituency trees are removed first when the dependency path is configured;
   * otherwise existing trees are lemmatized in place.
   *
   * @throws RuntimeException if called when coref.addMissingAnnotations is false
   */
  private void addMissingAnnotation(Annotation anno) {
    if (addMissingAnnotations) {
      boolean useConstituency = CorefProperties.useConstituencyTree(props);
      final boolean LEMMATIZE = true;
      List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        boolean hasTree = sentence.containsKey(TreeCoreAnnotations.TreeAnnotation.class);
        Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        if (!useConstituency) { // TODO: temp for dev: make sure we don't use constituency tree
          sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
        }
        if (LEMMATIZE && hasTree && useConstituency) treeLemmatizer.transformTree(tree); // TODO don't need?
      }
      corenlp.annotate(anno);
    } else {
      throw new RuntimeException("Error: must set coref.addMissingAnnotations = true to call method addMissingAnnotation");
    }
  }

  /** Rewinds the underlying document reader to the first document. */
  public void resetDocs() {
    reader.reset();
  }

  /**
   * Reads and builds the next document from the reader.
   *
   * @return the next coref Document, or null when the corpus is exhausted
   */
  public Document nextDoc() throws Exception {
    InputDoc input = reader.nextDoc();
    return (input == null) ? null : makeDocument(input);
  }

  /**
   * Deserializes the singleton-predictor classifier from the given path.
   * The stream is closed via try-with-resources (the original leaked it).
   *
   * @param serializedFile path or classpath location of the serialized classifier
   * @throws RuntimeIOException on I/O failure
   * @throws RuntimeException if the class of the serialized object is unavailable
   * @throws ClassCastException if the deserialized object is not a LogisticClassifier
   */
  @SuppressWarnings("unchecked") // guarded by the instanceof check; type args are erased at runtime
  public static LogisticClassifier<String, String> getSingletonPredictorFromSerializedFile(String serializedFile) {
    try (ObjectInputStream ois = IOUtils.readStreamFromString(serializedFile)) {
      Object o = ois.readObject();
      if (o instanceof LogisticClassifier<?, ?>) {
        return (LogisticClassifier<String, String>) o;
      }
      throw new ClassCastException("Wanted SingletonPredictor, got " + o.getClass());
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }
}