-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
WordToSentenceProcessor.java
619 lines (568 loc) · 29.4 KB
/
WordToSentenceProcessor.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
package edu.stanford.nlp.process;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Transforms a List of words into a List of Lists of words (that is, a List
* of sentences), by grouping the words. The word stream is assumed to
* already be adequately tokenized, and this class just divides the List into
* sentences, perhaps discarding some separator tokens as it goes.
* <p>
* The main behavior is to look for sentence ending tokens like "." or "?!?",
* and to split after them and any following sentence closers like ")".
* Overlaid on this is an overall choice of state: The WordToSentenceProcessor
* can be a non-splitter, which always returns one sentence. Otherwise, the
* WordToSentenceProcessor will also split based on paragraphs using one of
* these three states: (1) Ignore line breaks in splitting sentences,
* (2) Treat each line as a separate paragraph, or (3) Treat two consecutive
* line breaks as marking the end of a paragraph. The details of sentence
* breaking within paragraphs is controlled based on the following three
* variables:
* <ul>
* <li>sentenceBoundaryTokens are tokens that are left in a sentence, but are
* to be regarded as ending a sentence. A canonical example is a period.
* If two of these follow each other, the second will be a sentence
* consisting of only the sentenceBoundaryToken.
* <li>sentenceBoundaryFollowers are tokens that are left in a sentence, and
* which can follow a sentenceBoundaryToken while still belonging to
* the previous sentence. They cannot begin a sentence (except at the
* beginning of a document). A canonical example is a close parenthesis
* ')'.
* <li>sentenceBoundaryToDiscard are tokens which separate sentences and
* which should be thrown away. In web documents, a typical example would
* be a '{@code <p>}' tag. If two of these follow each other, they are
* coalesced: no empty Sentence is output. The end-of-file is not
* represented in this Set, but the code behaves as if it were a member.
* <li>regionElementRegex A regular expression for element names containing
* a sentence region. Only tokens in such elements will be included in
* sentences. The start and end tags themselves are not included in the
* sentence.
* </ul>
*
* Instances of this class are now immutable. ☺
*
* @author Joseph Smarr (jsmarr@stanford.edu)
* @author Christopher Manning
* @author Teg Grenager (grenager@stanford.edu)
* @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
*
* @param <IN> The type of the tokens in the sentences
*/
public class WordToSentenceProcessor<IN> implements ListProcessor<IN, List<IN>> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(WordToSentenceProcessor.class);

  /** Turning this on is good for debugging sentence splitting. */
  private static final boolean DEBUG = false;

  // todo [cdm Aug 2012]: This should be unified with the PlainTextIterator
  // in DocumentPreprocessor, perhaps by making this one implement Iterator.
  // (DocumentProcessor once used to use this class, but now doesn't....)

  /** How newline tokens interact with sentence splitting: NEVER ignores them,
   *  ALWAYS breaks after each one, TWO_CONSECUTIVE breaks only on a blank line
   *  (two newline tokens in a row).
   */
  public enum NewlineIsSentenceBreak { NEVER, ALWAYS, TWO_CONSECUTIVE }

  /** Default sentence-ending tokens: a single period, or any run of '!' and '?'. */
  public static final String DEFAULT_BOUNDARY_REGEX = "\\.|[!?]+";

  /** Pe = Close_Punctuation (close brackets), Pf = Final_Punctuation (close quotes);
   * add straight quotes, PTB escaped right brackets (-RRB-, etc.), greater than as close angle bracket,
   * and those forms in full width range.
   */
  public static final String DEFAULT_BOUNDARY_FOLLOWERS_REGEX = "[\\p{Pe}\\p{Pf}\"'>"'>]|''|-R[CRS]B-";

  /** Default separators to discard: the newline tokens produced by the standard lexers. */
  public static final Set<String> DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD = Collections.unmodifiableSet(
      Generics.newHashSet(Arrays.asList(WhitespaceLexer.NEWLINE, PTBTokenizer.getNewlineToken())));

  /**
   * Regex for tokens (Strings) that qualify as sentence-final tokens.
   */
  private final Pattern sentenceBoundaryTokenPattern;

  /**
   * Regex for multi token sequences that qualify as sentence-final tokens.
   * (i.e. use if you want to sentence split on 2 or more newlines)
   */
  private final SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern;

  /**
   * Regex for tokens (Strings) that qualify as tokens that can follow
   * what normally counts as an end of sentence token, and which are
   * attributed to the preceding sentence. For example ")" coming after
   * a period.
   */
  private final Pattern sentenceBoundaryFollowersPattern;

  /**
   * List of regex Pattern that are sentence boundaries to be discarded.
   * This is normally newline tokens or representations of them.
   */
  private final Set<String> sentenceBoundaryToDiscard;

  /** Patterns that match the start and end tags of XML elements. These will
   * be discarded, but taken to mark a sentence boundary.
   * The value will be null if there are no such elements being used
   * (for efficiency).
   */
  private final List<Pattern> xmlBreakElementsToDiscard;

  /**
   * List of regex Patterns that are not to be treated as sentence boundaries but should be discarded
   * (i.e. these may have been used with context to identify sentence boundaries but are not needed any more)
   */
  private final List<Pattern> tokenPatternsToDiscard;

  /** Matches the opening tag of a sentence region element; null if region filtering is off. */
  private final Pattern sentenceRegionBeginPattern;

  /** Matches the closing tag of a sentence region element; null if region filtering is off. */
  private final Pattern sentenceRegionEndPattern;

  /** Strategy for treating discarded newline tokens as sentence breaks. */
  private final NewlineIsSentenceBreak newlineIsSentenceBreak;

  /** If true, the whole input is returned as a single sentence (no splitting at all). */
  private final boolean isOneSentence;

  /** If true, empty sentences are kept in the output (used for strict one-sentence-per-line). */
  private final boolean allowEmptySentences;
public static NewlineIsSentenceBreak stringToNewlineIsSentenceBreak(String name) {
if ("always".equals(name)) {
return NewlineIsSentenceBreak.ALWAYS;
} else if ("never".equals(name)) {
return NewlineIsSentenceBreak.NEVER;
} else if (name != null && name.contains("two")) {
return NewlineIsSentenceBreak.TWO_CONSECUTIVE;
} else {
throw new IllegalArgumentException("Not a valid NewlineIsSentenceBreak name: '" + name + "' (should be one of 'always', 'never', 'two')");
}
}
/** This is a sort of hacked in other way to end sentences.
* Tokens with the ForcedSentenceEndAnnotation set to true
* will also end a sentence.
*/
@SuppressWarnings("OverlyStrongTypeCast")
private static boolean isForcedEndToken(Object o) {
if (o instanceof CoreMap) {
Boolean forcedEndValue =
((CoreMap)o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
return forcedEndValue != null && forcedEndValue;
} else {
return false;
}
}
@SuppressWarnings("OverlyStrongTypeCast")
private static String getString(Object o) {
if (o instanceof HasWord) {
HasWord h = (HasWord) o;
return h.word();
} else if (o instanceof String) {
return (String) o;
} else if (o instanceof CoreMap) {
return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
} else {
throw new RuntimeException("Expected token to be either Word or String.");
}
}
@SuppressWarnings("Convert2streamapi")
private static boolean matches(List<Pattern> patterns, String word) {
for (Pattern p: patterns) {
Matcher m = p.matcher(word);
if (m.matches()) {
return true;
}
}
return false;
}
  /** Returns true iff the word matches one of the XML break element tag patterns
   *  (e.g. a {@code <p>} tag); such tokens are discarded but force a sentence break.
   *  Caller must ensure xmlBreakElementsToDiscard is non-null before calling.
   */
  private boolean matchesXmlBreakElementToDiscard(String word) {
    return matches(xmlBreakElementsToDiscard, word);
  }
  /** Returns true iff the word matches one of the token-discard patterns; such
   *  tokens are silently dropped from the output without marking a boundary.
   *  Caller must ensure tokenPatternsToDiscard is non-null before calling.
   */
  private boolean matchesTokenPatternsToDiscard(String word) {
    return matches(tokenPatternsToDiscard, word);
  }
/**
* Returns a List of Lists where each element is built from a run
* of Words in the input Document. Specifically, reads through each word in
* the input document and breaks off a sentence after finding a valid
* sentence boundary token or end of file.
* Note that for this to work, the words in the
* input document must have been tokenized with a tokenizer that makes
* sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
*
* @param words A list of already tokenized words (must implement HasWord or be a String).
* @return A list of sentences.
* @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
*/
// todo [cdm 2016]: Should really sort out generics here so don't need to have extra list copying
@Override
public List<List<IN>> process(List<? extends IN> words) {
if (isOneSentence) {
// put all the words in one sentence
List<List<IN>> sentences = Generics.newArrayList();
sentences.add(new ArrayList<>(words));
return sentences;
} else {
return wordsToSentences(words);
}
}
  /**
   * Returns a List of Lists where each element is built from a run
   * of Words in the input Document. Specifically, reads through each word in
   * the input document and breaks off a sentence after finding a valid
   * sentence boundary token or end of file.
   * Note that for this to work, the words in the
   * input document must have been tokenized with a tokenizer that makes
   * sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
   *
   * @param words A list of already tokenized words (must implement HasWord or be a String).
   * @return A list of sentences.
   * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
   */
  @SuppressWarnings("ConstantConditions")
  private List<List<IN>> wordsToSentences(List<? extends IN> words) {
    IdentityHashMap<Object, Boolean> isSentenceBoundary = null; // is null unless used by sentenceBoundaryMultiTokenPattern
    if (DEBUG) { log.info("Cutting up: " + words); }
    if (sentenceBoundaryMultiTokenPattern != null) {
      if (DEBUG) { log.info(" checking for tokensregex pattern: " + sentenceBoundaryMultiTokenPattern); }
      // Do initial pass using TokensRegex to identify multi token patterns that need to be matched
      // and add the last token of a match to our table of sentence boundary tokens.
      // Note: keyed by object identity, not equals(), so the exact token instance
      // seen later in the main loop triggers the boundary.
      isSentenceBoundary = new IdentityHashMap<>();
      SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
      while (matcher.find()) {
        List<? super IN> nodes = matcher.groupNodes();
        if (nodes != null && ! nodes.isEmpty()) {
          if (DEBUG) { log.info(" found match at: " + nodes); }
          isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
        }
      }
    }
    // Split tokens into sentences!!!
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = new ArrayList<>();
    List<IN> lastSentence = null;
    boolean insideRegion = false;          // true while between region begin/end tags (if region filtering is on)
    boolean inWaitForForcedEnd = false;    // true inside a ForcedSentenceUntilEnd span: suppress normal boundaries
    boolean lastTokenWasNewline = false;   // tracks consecutive newlines for TWO_CONSECUTIVE mode
    boolean lastSentenceEndForced = false; // a forced end blocks boundary-followers from attaching to the last sentence
    for (IN o: words) {
      String word = getString(o);
      boolean forcedEnd = isForcedEndToken(o);
      // if (DEBUG) { if (forcedEnd) { log.info("Word is " + word + "; marks forced end of sentence [cont.]"); } }
      boolean inMultiTokenExpr = false;
      boolean discardToken = false;
      if (o instanceof CoreMap) {
        // Hacky stuff to ensure sentence breaks do not happen in certain cases
        CoreMap cm = (CoreMap) o;
        if ( ! forcedEnd) {
          Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
          if (forcedUntilEndValue != null && forcedUntilEndValue) {
            // if (DEBUG) { log.info("Word is " + word + "; starting wait for forced end of sentence [cont.]"); }
            inWaitForForcedEnd = true;
          } else {
            MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
            if (mt != null && ! mt.isEnd()) {
              // In the middle of a multi token mention, make sure sentence is not ended here
              // if (DEBUG) { log.info("Word is " + word + "; inside multi-token mention [cont.]"); }
              inMultiTokenExpr = true;
            }
          }
        }
      }
      if (tokenPatternsToDiscard != null) {
        discardToken = matchesTokenPatternsToDiscard(word);
      }
      // Region filtering: anything before the region-begin tag is dropped entirely.
      if (sentenceRegionBeginPattern != null && ! insideRegion) {
        if (DEBUG) { log.info("Word is " + word + "; outside region; deleted"); }
        if (sentenceRegionBeginPattern.matcher(word).matches()) {
          insideRegion = true;
          if (DEBUG) { log.info(" entering region"); }
        }
        lastTokenWasNewline = false;
        continue;
      }
      // Boundary-follower handling: a close paren/quote right after a (non-forced)
      // sentence end is tacked onto the previous sentence, not started as a new one.
      if ( ! lastSentenceEndForced && lastSentence != null && currentSentence.isEmpty() &&
          ! lastTokenWasNewline && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
        if ( ! discardToken) {
          lastSentence.add(o);
        }
        if (DEBUG) {
          log.info("Word is " + word + (discardToken ? "discarded":" added to last sentence"));
        }
        lastTokenWasNewline = false;
        continue;
      }
      boolean newSentForced = false; // break caused by markup/newline/forced annotation
      boolean newSent = false;       // break caused by a regular sentence-final token
      String debugText = (discardToken)? "discarded": "added to current";
      if (inWaitForForcedEnd && ! forcedEnd) {
        if (sentenceBoundaryToDiscard.contains(word)) {
          // there can be newlines even in something to keep together
          discardToken = true;
        }
        if ( ! discardToken) currentSentence.add(o);
        if (DEBUG) { log.info("Word is " + word + "; in wait for forced end; " + debugText); }
      } else if (inMultiTokenExpr && ! forcedEnd) {
        if ( ! discardToken) currentSentence.add(o);
        if (DEBUG) { log.info("Word is " + word + "; in multi token expr; " + debugText); }
      } else if (sentenceBoundaryToDiscard.contains(word)) {
        // a discarded separator (normally a newline token)
        if (forcedEnd) {
          // sentence boundary can easily be forced end
          inWaitForForcedEnd = false;
          newSentForced = true;
        } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
          newSentForced = true;
        } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE && lastTokenWasNewline) {
          newSentForced = true;
        }
        lastTokenWasNewline = true;
        if (DEBUG) {
          log.info("Word is " + word + "; a discarded sentence boundary; newSentForced=" + newSentForced);
        }
      } else {
        lastTokenWasNewline = false;
        Boolean isb;
        if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
          newSentForced = true;
          if (DEBUG) { log.info("Word is " + word + "; is XML break element; discarded"); }
        } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
          insideRegion = false;
          newSentForced = true;
          // Marked sentence boundaries
        } else if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary.get(o)) != null) && isb) {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info("Word is " + word + "; is sentence boundary (matched multi-token pattern); " + debugText);
          }
          newSent = true;
        } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
          if ( ! discardToken) { currentSentence.add(o); }
          if (DEBUG) { log.info("Word is " + word + "; is sentence boundary; " + debugText); }
          newSent = true;
        } else if (forcedEnd) {
          if ( ! discardToken) { currentSentence.add(o); }
          inWaitForForcedEnd = false;
          newSentForced = true;
          if (DEBUG) { log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText); }
        } else {
          if ( ! discardToken) currentSentence.add(o);
          // chris added this next test in 2017; a bit weird, but KBP setup doesn't have newline in sentenceBoundary patterns, just in toDiscard
          if (AbstractTokenizer.NEWLINE_TOKEN.equals(word)) {
            lastTokenWasNewline = true;
          }
          if (DEBUG) { log.info("Word is " + word + "; " + debugText); }
        }
      }
      // Close off the current sentence if a break was seen (empty sentences only
      // when allowEmptySentences, i.e. strict one-sentence-per-line mode).
      if ((newSentForced || newSent) && ( ! currentSentence.isEmpty() || allowEmptySentences)) {
        sentences.add(currentSentence);
        // adds this sentence now that it's complete
        lastSentenceEndForced = ((lastSentence == null || lastSentence.isEmpty()) && lastSentenceEndForced) || newSentForced;
        lastSentence = currentSentence;
        currentSentence = new ArrayList<>(); // clears the current sentence
        if (DEBUG) {
          String debugWhy = newSentForced ? " because forced" : " due to regular sentence end";
          String debugState = "; lastSentenceEndForced=" + lastSentenceEndForced;
          log.info(" beginning new sentence" + debugWhy + debugState);
        }
      } else if (newSentForced) {
        // forced break with nothing accumulated: remember it so followers don't attach
        lastSentenceEndForced = true;
        if (DEBUG) { log.info(" lastSentenceEndForced=" + lastSentenceEndForced); }
      }
    }
    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if ( ! currentSentence.isEmpty()) {
      sentences.add(currentSentence); // adds last sentence
    }
    return sentences;
  }
public <L, F> Document<L, F, List<IN>> processDocument(Document<L, F, IN> in) {
Document<L, F, List<IN>> doc = in.blankDocument();
doc.addAll(process(in));
return doc;
}
/* ---------- Constructors --------- */
  /**
   * Create a {@code WordToSentenceProcessor} using a sensible default
   * list of tokens for sentence ending for English/Latin writing systems.
   * The default set is: {".","?","!"} and
   * any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!.
   * A sequence of two or more consecutive line breaks is taken as a paragraph break
   * which also splits sentences. This is the usual constructor for sentence
   * breaking reasonable text, which uses hard-line breaking, so two
   * blank lines indicate a paragraph break.
   * People commonly use this constructor.
   */
  public WordToSentenceProcessor() {
    // delegates to the boolean constructor with isOneSentence=false
    this(false);
  }
  /**
   * Create a {@code WordToSentenceProcessor} using a sensible default
   * list of tokens for sentence ending for English/Latin writing systems.
   * The default set is: {".","?","!"} and
   * any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!.
   * You can specify the treatment of newlines as sentence breaks as one
   * of ignored, every newline is a sentence break, or only two or more
   * consecutive newlines are a sentence break.
   *
   * @param newlineIsSentenceBreak Strategy for treating newlines as
   *                               paragraph breaks.
   */
  public WordToSentenceProcessor(NewlineIsSentenceBreak newlineIsSentenceBreak) {
    this(DEFAULT_BOUNDARY_REGEX, newlineIsSentenceBreak, false);
  }
  /**
   * Create a {@code WordToSentenceProcessor} which never breaks the input
   * into multiple sentences. If the argument is true, the input stream
   * is always output as one sentence. (If it is false, this is
   * equivalent to the no argument constructor, so why use this?)
   *
   * @param isOneSentence Marker argument: true means to treat input
   *                      as one sentence
   */
  public WordToSentenceProcessor(boolean isOneSentence) {
    // default boundary regex and paragraph-style (blank line) newline handling
    this(DEFAULT_BOUNDARY_REGEX, NewlineIsSentenceBreak.TWO_CONSECUTIVE, isOneSentence);
  }
  /**
   * Set the set of Strings that will mark the end of a sentence,
   * and which will be discarded after doing so.
   * This constructor is used for, and usually only for, doing
   * one-sentence-per-line sentence splitting. Since in such cases, you
   * generally want to strictly preserve the set of lines in the input,
   * it preserves empty lines as empty sentences in the output.
   *
   * @param boundaryToDiscard A Set of String that will be matched
   *                          with .equals() and will mark an
   *                          end of sentence and be discarded.
   */
  public WordToSentenceProcessor(Set<String> boundaryToDiscard) {
    // empty boundary/follower regexes: only the discarded separators split;
    // ALWAYS + allowEmptySentences=true gives strict one-sentence-per-line
    this("", "", boundaryToDiscard, null, null,
        NewlineIsSentenceBreak.ALWAYS, null, null, false, true);
  }
  /**
   * Create a basic {@code WordToSentenceProcessor} specifying just a few top-level options.
   *
   * @param boundaryTokenRegex The set of boundary tokens
   * @param newlineIsSentenceBreak Strategy for treating newlines as sentence breaks
   * @param isOneSentence Whether to treat whole text as one sentence
   *                      (if true, the other two parameters are ignored).
   */
  public WordToSentenceProcessor(String boundaryTokenRegex,
                                 NewlineIsSentenceBreak newlineIsSentenceBreak,
                                 boolean isOneSentence) {
    this(boundaryTokenRegex, DEFAULT_BOUNDARY_FOLLOWERS_REGEX, DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD,
        null, null, newlineIsSentenceBreak, null, null, isOneSentence, false);
  }
  /**
   * Flexibly set the set of acceptable sentence boundary tokens, but with
   * a default set of allowed boundary following tokens. Also can set sentence boundary
   * to discard tokens and xmlBreakElementsToDiscard and set the treatment of newlines
   * (boundaryToDiscard) as sentence ends.
   *
   * This one is convenient in allowing any of the first 3 arguments to be null,
   * and then the usual defaults are substituted for it.
   * The allowed set of boundary followers defaults to
   * {@link #DEFAULT_BOUNDARY_FOLLOWERS_REGEX}.
   * The default set of discarded separator tokens includes the
   * newline tokens used by WhitespaceLexer and PTBLexer.
   *
   * @param boundaryTokenRegex The regex of boundary tokens. If null, use default.
   * @param boundaryFollowersRegex The regex of boundary following tokens. If null, use default.
   *                               These are tokens which should normally be added on to the current sentence
   *                               even after something normally sentence ending has been seen. For example,
   *                               typically a close parenthesis or close quotes goes with the current sentence,
   *                               even after a period or question mark have been seen.
   * @param boundaryToDiscard The set of regex for sentence boundary tokens that should be discarded.
   *                          If null, use default.
   * @param xmlBreakElementsToDiscard xml element names like "p", which will be recognized,
   *                                  treated as sentence ends, and discarded.
   *                                  If null, use none.
   * @param newlineIsSentenceBreak Strategy for counting line ends (boundaryToDiscard) as sentence ends.
   */
  public WordToSentenceProcessor(String boundaryTokenRegex,
                                 String boundaryFollowersRegex,
                                 Set<String> boundaryToDiscard, Set<String> xmlBreakElementsToDiscard,
                                 NewlineIsSentenceBreak newlineIsSentenceBreak,
                                 SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern,
                                 Set<String> tokenRegexesToDiscard) {
    // null (or empty, for the discard set) arguments fall back to the class defaults
    this(boundaryTokenRegex == null ? DEFAULT_BOUNDARY_REGEX : boundaryTokenRegex,
        boundaryFollowersRegex == null ? DEFAULT_BOUNDARY_FOLLOWERS_REGEX: boundaryFollowersRegex,
        boundaryToDiscard == null || boundaryToDiscard.isEmpty() ? DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD : boundaryToDiscard,
        xmlBreakElementsToDiscard == null ? Collections.emptySet() : xmlBreakElementsToDiscard,
        null, newlineIsSentenceBreak, sentenceBoundaryMultiTokenPattern, tokenRegexesToDiscard, false, false);
  }
/**
* Configure all parameters for converting a list of tokens into sentences.
* The whole enchilada.
*
* @param boundaryTokenRegex Tokens that match this regex will end a
* sentence, but are retained at the end of
* the sentence. Substantive value must be supplied.
* @param boundaryFollowersRegex This is a Set of String that are matched with
* .equals() which are allowed to be tacked onto
* the end of a sentence after a sentence boundary
* token, for example ")". Substantive value must be supplied.
* @param boundariesToDiscard This is normally used for newline tokens if
* they are included in the tokenization. They
* may end the sentence (depending on the setting
* of newlineIsSentenceBreak), but at any rate
* are deleted from sentences in the output.
* Substantive value must be supplied.
* @param xmlBreakElementsToDiscard These are elements like "p" or "sent",
* which will be wrapped into regex for
* approximate XML matching. They will be
* deleted in the output, and will always
* trigger a sentence boundary.
* May be null; means discard none.
* @param regionElementRegex XML element name regex to delimit regions processed.
* Tokens outside one of these elements are discarded.
* May be null; means to not filter by regions
* @param newlineIsSentenceBreak How to treat newlines. Must have substantive value.
* @param sentenceBoundaryMultiTokenPattern A TokensRegex multi-token pattern for finding boundaries.
* May be null; means that there are no such patterns.
* @param tokenRegexesToDiscard Regex for tokens to discard.
* May be null; means that no tokens are discarded in this way.
* @param isOneSentence Whether to treat whole of input as one sentence regardless.
* Must have substantive value. Overrides anything else.
* @param allowEmptySentences Whether to allow empty sentences to be output
* Must have substantive value. Often suppressed, but don't want that in things like
* strict one-sentence-per-line mode.
*/
public WordToSentenceProcessor(String boundaryTokenRegex, String boundaryFollowersRegex,
Set<String> boundariesToDiscard, Set<String> xmlBreakElementsToDiscard,
String regionElementRegex, NewlineIsSentenceBreak newlineIsSentenceBreak,
SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern,
Set<String> tokenRegexesToDiscard,
boolean isOneSentence, boolean allowEmptySentences) {
sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
sentenceBoundaryFollowersPattern = Pattern.compile(boundaryFollowersRegex);
sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundariesToDiscard);
if (xmlBreakElementsToDiscard == null || xmlBreakElementsToDiscard.isEmpty()) {
this.xmlBreakElementsToDiscard = null;
} else {
this.xmlBreakElementsToDiscard = new ArrayList<>(xmlBreakElementsToDiscard.size());
for (String s: xmlBreakElementsToDiscard) {
String regex = "<\\s*(?:/\\s*)?(?:" + s + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
// log.info("Regex is |" + regex + "|");
// todo: Historically case insensitive, but maybe better and more proper to make case sensitive?
this.xmlBreakElementsToDiscard.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
}
}
if (regionElementRegex != null) {
sentenceRegionBeginPattern = Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
} else {
sentenceRegionBeginPattern = null;
sentenceRegionEndPattern = null;
}
this.newlineIsSentenceBreak = newlineIsSentenceBreak;
this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;
if (tokenRegexesToDiscard != null) {
this.tokenPatternsToDiscard = new ArrayList<>(tokenRegexesToDiscard.size());
for (String s: tokenRegexesToDiscard) {
this.tokenPatternsToDiscard.add(Pattern.compile(s));
}
} else {
this.tokenPatternsToDiscard = null;
}
this.isOneSentence = isOneSentence;
this.allowEmptySentences = allowEmptySentences;
if (DEBUG) {
log.info("WordToSentenceProcessor: boundaryTokens=" + boundaryTokenRegex);
log.info(" boundaryFollowers=" + boundaryFollowersRegex);
log.info(" boundariesToDiscard=" + boundariesToDiscard);
log.info(" xmlBreakElementsToDiscard=" + xmlBreakElementsToDiscard);
log.info(" regionBeginPattern=" + sentenceRegionBeginPattern);
log.info(" regionEndPattern=" + sentenceRegionEndPattern);
log.info(" newlineIsSentenceBreak=" + newlineIsSentenceBreak);
log.info(" sentenceBoundaryMultiTokenPattern=" + sentenceBoundaryMultiTokenPattern);
log.info(" tokenPatternsToDiscard=" + tokenPatternsToDiscard);
log.info(" isOneSentence=" + isOneSentence);
log.info(" allowEmptySentences=" + allowEmptySentences);
log.info(new Exception("above WordToSentenceProcessor invoked from here:"));
}
}
}