-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
ChineseSegmenterAnnotator.java
370 lines (328 loc) · 15 KB
/
ChineseSegmenterAnnotator.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
package edu.stanford.nlp.pipeline;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;
/**
* This class will add segmentation information to an Annotation.
* It assumes that the original document is a List of sentences under the
* {@code SentencesAnnotation.class} key, and that each sentence has a
* {@code TextAnnotation.class} key. This Annotator adds corresponding
* information under a {@code CharactersAnnotation.class} key prior to segmentation,
* and a {@code TokensAnnotation.class} key with value of a List of CoreLabel
* after segmentation.
*
* @author Pi-Chuan Chang
*/
public class ChineseSegmenterAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseSegmenterAnnotator.class);

  /** Default property-name prefix under which this annotator's settings are looked up. */
  private static final String DEFAULT_MODEL_NAME = "segment";

  private static final String DEFAULT_SEG_LOC =
      "/u/nlp/data/chinese-segmenter/stanford-seg-2010/classifiers-2013/ctb7.chris6.lex.gz";

  // Fix: this path previously began with a doubled leading slash ("//u/..."),
  // inconsistent with DEFAULT_SEG_LOC and DEFAULT_SIGHAN_CORPORA_DICT.
  private static final String DEFAULT_SER_DICTIONARY =
      "/u/nlp/data/chinese-segmenter/stanford-seg-2010/classifiers-2013/dict-chris6.ser.gz";

  private static final String DEFAULT_SIGHAN_CORPORA_DICT =
      "/u/nlp/data/chinese-segmenter/stanford-seg-2010/releasedata/";

  /** Matches one line terminator: \r, \n, \r\n, or the platform line separator. */
  private static final String separator = "(?:\r|\r?\n|" + System.lineSeparator() + ')';
  private static final Pattern separatorPattern = Pattern.compile(separator);

  /** The CRF sequence classifier that performs the actual word segmentation. */
  private final AbstractSequenceClassifier<?> segmenter;
  private final boolean VERBOSE;
  /** If true, newline characters are kept as tokens so that ssplit can split on them. */
  private final boolean tokenizeNewline;
  /** If true, spaces inside XML tokens are replaced with non-breaking spaces (U+00A0). */
  private final boolean normalizeSpace;

  /** Builds an annotator using the default segmenter model and dictionaries. */
  public ChineseSegmenterAnnotator() {
    this(DEFAULT_SEG_LOC, false);
  }

  /**
   * Builds an annotator from a model location, using the default dictionaries.
   *
   * @param segLoc  path to the serialized segmenter model
   * @param verbose whether to log progress and per-token details
   */
  public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
    this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
  }

  /**
   * Builds an annotator from explicit model and dictionary locations.
   *
   * @param segLoc            path to the serialized segmenter model
   * @param verbose           whether to log progress and per-token details
   * @param serDictionary     path to the serialized dictionary
   * @param sighanCorporaDict path to the SIGHAN corpora dictionary directory
   */
  public ChineseSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) {
    this(DEFAULT_MODEL_NAME,
        PropertiesUtils.asProperties(
            DEFAULT_MODEL_NAME + ".serDictionary", serDictionary,
            DEFAULT_MODEL_NAME + ".sighanCorporaDict", sighanCorporaDict,
            DEFAULT_MODEL_NAME + ".verbose", Boolean.toString(verbose),
            DEFAULT_MODEL_NAME + ".model", segLoc));
  }

  /**
   * Builds the annotator from pipeline properties. Only properties prefixed with
   * {@code name + '.'} apply; the prefix is stripped and the remainder is passed
   * through to the CRFClassifier. {@code <name>.model} is required.
   *
   * @param name  property-name prefix for this annotator (e.g. "segment")
   * @param props the full pipeline Properties
   * @throws RuntimeException if {@code <name>.model} is missing or the model cannot be loaded
   */
  public ChineseSegmenterAnnotator(String name, Properties props) {
    String model = null;
    // Keep only the properties that apply to this annotator
    Properties modelProps = new Properties();
    String desiredKey = name + '.';
    for (String key : props.stringPropertyNames()) {
      if (key.startsWith(desiredKey)) {
        // skip past name and the subsequent "."
        String modelKey = key.substring(desiredKey.length());
        if (modelKey.equals("model")) {
          model = props.getProperty(key);
        } else {
          modelProps.setProperty(modelKey, props.getProperty(key));
        }
      }
    }
    this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false);
    this.normalizeSpace = PropertiesUtils.getBool(props, name + ".normalizeSpace", false);
    if (model == null) {
      throw new RuntimeException("Expected a property " + name + ".model");
    }
    // don't write very much, because the CRFClassifier already reports loading
    if (VERBOSE) {
      log.info("Loading Segmentation Model ... ");
    }
    try {
      segmenter = CRFClassifier.getClassifier(model, modelProps);
    } catch (RuntimeException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    // If newlines are treated as sentence split, we need to retain them in tokenization for
    // ssplit to make use of them. parseBoolean avoids the boxing that Boolean.valueOf incurs.
    tokenizeNewline = (!props.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "never").equals("never"))
        || Boolean.parseBoolean(props.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
  }

  /**
   * Segments each sentence of the annotation, or the whole annotation text
   * if no sentence split has been performed yet.
   *
   * @param annotation the document to segment; modified in place
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Adding Segmentation annotation ... ");
    }
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences != null) {
      for (CoreMap sentence : sentences) {
        doOneSentence(sentence);
      }
    } else {
      // No sentence split yet: treat the entire annotation as one unit
      doOneSentence(annotation);
    }
  }

  /** Splits the text into per-character CoreLabels, then runs the segmenter over it. */
  private void doOneSentence(CoreMap annotation) {
    splitCharacters(annotation);
    runSegmentation(annotation);
  }

  /** This is based on the "SGML2" pattern from PTBLexer.flex. */
  private static final Pattern xmlPattern =
      Pattern.compile("<([!?][A-Za-z-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:.-]*([ ]+([A-Za-z][A-Za-z0-9_:.-]*|[A-Za-z][A-Za-z0-9_:.-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z][A-Za-z0-9_:.-]*)))*[ ]*/?|/[A-Za-z][A-Za-z0-9_:.-]*)[ ]*>");

  /** This gets the TextAnnotation and creates a CharactersAnnotation, where, roughly,
   *  the text has been separated into one character non-whitespace tokens with ChineseCharAnnotation, and with
   *  a ChineseSegAnnotation marking the ones after whitespace, so that there will definitely
   *  be word segmentation there. In 2016, two improvements were added: Handling non-BMP characters
   *  correctly and not splitting on whitespace in the same types of XML places that are recognized by
   *  English PTBTokenizer.
   *
   *  @param annotation The annotation to process
   */
  private void splitCharacters(CoreMap annotation) {
    String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
    boolean seg = true; // false only while inside an XML entity
    List<CoreLabel> charTokens = new ArrayList<>();
    int length = origText.length();
    // [xmlStartOffset, xmlEndOffset) is the next XML region in the text, if any
    int xmlStartOffset = Integer.MAX_VALUE;
    int xmlEndOffset = -1;
    Matcher m = xmlPattern.matcher(origText);
    if (m.find()) {
      xmlStartOffset = m.start();
      xmlEndOffset = m.end();
    }
    // Iterate by code point so surrogate pairs (non-BMP characters) stay in one token
    for (int offset = 0, cpCharCount; offset < length; offset += cpCharCount) {
      int cp = origText.codePointAt(offset);
      cpCharCount = Character.charCount(cp);
      CoreLabel wi = new CoreLabel();
      String charString = origText.substring(offset, offset + cpCharCount); // new Java 8 substring, don't need to copy.

      if (offset == xmlEndOffset) {
        // we just left an XML region; search for the next one from here
        m = xmlPattern.matcher(origText);
        if (m.find(offset)) {
          xmlStartOffset = m.start();
          xmlEndOffset = m.end();
        }
      }

      boolean skipCharacter = false;
      boolean isXMLCharacter = false;
      // first two cases are for XML region
      if (offset == xmlStartOffset) {
        seg = true;
        isXMLCharacter = true;
      } else if (offset > xmlStartOffset && offset < xmlEndOffset) {
        seg = false;
        isXMLCharacter = true;
      } else if (Character.isSpaceChar(cp) || Character.isISOControl(cp)) {
        // if this word is a whitespace or a control character, set 'seg' to true for next character
        seg = true;
        // Don't skip newline characters if we're tokenizing them
        // We always count \n as newline to be consistent with the implementation of ssplit
        skipCharacter = ! (tokenizeNewline && (cp == '\n' || cp == '\r' || System.lineSeparator().contains(charString)));
      }

      if ( ! skipCharacter) {
        // if this character is a normal character, put it in as a CoreLabel and set seg to false for next word
        wi.set(CoreAnnotations.ChineseCharAnnotation.class, charString);
        if (seg) {
          wi.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
        } else {
          wi.set(CoreAnnotations.ChineseSegAnnotation.class, "0");
        }
        if (isXMLCharacter) {
          if (Character.isSpaceChar(cp) || Character.isISOControl(cp)) {
            // We mark XML whitespace with a special tag because later they will be handled differently
            // than non-whitespace XML characters. This is because the segmenter eats whitespaces...
            wi.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, "whitespace");
          } else if (offset == xmlStartOffset) {
            wi.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, "beginning");
          } else {
            wi.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, "1");
          }
        } else {
          wi.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, "0");
        }
        wi.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
        wi.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, (offset + cpCharCount));
        charTokens.add(wi);
        seg = false;
      }
    } // for loop through charPoints

    annotation.set(SegmenterCoreAnnotations.CharactersAnnotation.class, charTokens);
  }

  /**
   * Runs the CRF segmenter over the sentence text and converts the resulting word
   * strings into CoreLabel tokens with character offsets, re-assembling XML regions
   * (which the segmenter would otherwise mangle) into single tokens. Sets the
   * TokensAnnotation on the given CoreMap.
   */
  private void runSegmentation(CoreMap annotation) {
    String text = annotation.get(CoreAnnotations.TextAnnotation.class); // the original text String
    List<CoreLabel> sentChars = annotation.get(SegmenterCoreAnnotations.CharactersAnnotation.class); // the way it was divided by splitCharacters
    List<CoreLabel> tokens = new ArrayList<>();
    annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);

    // Run the segmenter! On the whole String. It knows not about the splitting into chars.
    // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
    List<String> words;
    if ( ! tokenizeNewline) {
      // Strip line terminators so they don't appear in segmenter output
      text = text.replaceAll("[\r\n]", "");
      words = segmenter.segmentString(text);
    } else {
      // Run the segmenter on each line so that we don't get tokens that cross line boundaries
      // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
      String[] lines = text.split(String.format("((?<=%1$s)|(?=%1$s))", separator));
      words = new ArrayList<>();
      for (String line : lines) {
        if (separatorPattern.matcher(line).matches()) {
          // Don't segment newline tokens, keep them as-is
          words.add(line);
        } else {
          words.addAll(segmenter.segmentString(line));
        }
      }
    }
    if (VERBOSE) {
      log.info(text + "\n--->\n" + words);
    }

    // Go through everything again and make the final tokens list
    int pos = 0; // This is used to index sentChars, the output from splitCharacters
    StringBuilder xmlBuffer = new StringBuilder();
    int xmlBegin = -1;
    for (String w : words) {
      CoreLabel fl = sentChars.get(pos);
      if (fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class).equals("0")
          || fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class).equals("beginning")) {
        // Beginnings of plain text and other XML tags are good places to end an XML tag
        if (xmlBuffer.length() > 0) {
          // Form the XML token
          String xmlTag = xmlBuffer.toString();
          CoreLabel fl1 = sentChars.get(pos - 1);
          int end = fl1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
          tokens.add(makeXmlToken(xmlTag, true, xmlBegin, end));
          // Clean up and prepare for the next XML tag
          xmlBegin = -1;
          xmlBuffer = new StringBuilder();
        }
      }
      if ( ! fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class).equals("0")) {
        // found an XML character: accumulate it into the pending XML token
        while (fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class).equals("whitespace")) {
          // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
          // and we're in sync with segmenter output again
          xmlBuffer.append(' ');
          pos += 1;
          fl = sentChars.get(pos);
        }
        xmlBuffer.append(w);
        // NOTE(review): w.length() counts UTF-16 units while sentChars holds one entry per
        // code point; these can disagree for non-BMP characters — TODO confirm upstream guarantee
        pos += w.length();
        if (xmlBegin < 0) {
          xmlBegin = fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        }
        continue;
      }

      fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
      if (w.isEmpty()) {
        if (VERBOSE) { log.warn("Encountered an empty word. Shouldn't happen?"); }
        continue; // [cdm 2016:] surely this shouldn't happen!
      }
      int begin = fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      pos += w.length();
      if (pos - 1 >= sentChars.size()) {
        // Log context first; the get() below will still throw IndexOutOfBoundsException.
        log.error("on word " + w + " at position " + (pos - w.length()) + " trying to get at position " + (pos - 1));
        log.error("last element of sentChars is " + sentChars.get(sentChars.size() - 1));
      }
      fl = sentChars.get(pos - 1);
      int end = fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      tokens.add(makeXmlToken(w, false, begin, end));
    }

    if (xmlBuffer.length() > 0) {
      // Form the last XML token, if any
      String xmlTag = xmlBuffer.toString();
      CoreLabel fl1 = sentChars.get(pos - 1);
      int end = fl1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      tokens.add(makeXmlToken(xmlTag, true, xmlBegin, end));
    }

    if (VERBOSE) {
      for (CoreLabel token : tokens) {
        log.info(token.toShorterString());
      }
    }
  }

  /**
   * Wraps a token string in a CoreLabel with the given character offsets.
   * Newline tokens are mapped to the CoreNLP newline sentinel; when
   * {@code doNormalization} and {@code normalizeSpace} are both set, spaces
   * are converted to non-breaking spaces so the token stays one unit.
   *
   * @param tokenText       the surface text of the token
   * @param doNormalization whether space normalization may be applied (used for XML tokens)
   * @param charOffsetBegin begin character offset in the original text
   * @param charOffsetEnd   end character offset in the original text
   * @return a CoreLabel with word, value, original text, and offsets set
   */
  private CoreLabel makeXmlToken(String tokenText, boolean doNormalization, int charOffsetBegin, int charOffsetEnd) {
    CoreLabel token = new CoreLabel();
    token.setOriginalText(tokenText);
    if (separatorPattern.matcher(tokenText).matches()) {
      // Map to CoreNLP newline token
      tokenText = AbstractTokenizer.NEWLINE_TOKEN;
    } else if (doNormalization && normalizeSpace) {
      tokenText = tokenText.replace(' ', '\u00A0'); // change space to non-breaking space
    }
    token.setWord(tokenText);
    token.setValue(tokenText);
    token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charOffsetBegin);
    token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charOffsetEnd);
    if (VERBOSE) {
      log.info("Adding token " + token.toShorterString());
    }
    return token;
  }

  /** This annotator has no prerequisites beyond the raw text. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.emptySet();
  }

  /** Annotations produced by this annotator (tokenization plus per-token metadata). */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.PositionAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.ValueAnnotation.class
    ));
  }

}