Skip to content

Commit

Permalink
Improve cross-lingual support of WordsToSentencesProcessor. Closes #314
Browse files Browse the repository at this point in the history
…. - use a decent generic Unicode regex for sentence boundary followers - clean up the factory/constructor methods - add to Javadoc - add test on Chinese
  • Loading branch information
manning authored and Stanford NLP committed Dec 3, 2016
1 parent 28eee7f commit 97f09e2
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 136 deletions.
10 changes: 6 additions & 4 deletions src/edu/stanford/nlp/pipeline/AnnotatorFactories.java
Expand Up @@ -161,21 +161,21 @@ public Annotator create() {
boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"));
if (whitespaceTokenization) {
if (System.lineSeparator().equals("\n")) {
return WordsToSentencesAnnotator.newlineSplitter(false, "\n");
return WordsToSentencesAnnotator.newlineSplitter("\n");
} else {
// throw "\n" in just in case files use that instead of
// the system separator
return WordsToSentencesAnnotator.newlineSplitter(false, System.lineSeparator(), "\n");
return WordsToSentencesAnnotator.newlineSplitter(System.lineSeparator(), "\n");
}
} else {
return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken());
return WordsToSentencesAnnotator.newlineSplitter(PTBTokenizer.getNewlineToken());
}

} else {
// Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
String isOneSentence = properties.getProperty("ssplit.isOneSentence");
if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false
return WordsToSentencesAnnotator.nonSplitter(false);
return WordsToSentencesAnnotator.nonSplitter();
}

// multi token sentence boundaries
Expand All @@ -192,6 +192,8 @@ public Annotator create() {
String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
Set<String> boundariesToDiscard = null;

// todo [cdm 2016]: Add support for specifying ssplit.boundaryFollowerRegex here and send down to WordsToSentencesAnnotator

// newline boundaries which are discarded.
String bounds = properties.getProperty("ssplit.boundariesToDiscard");
if (bounds != null) {
Expand Down
Expand Up @@ -9,7 +9,7 @@ segment.serDictionary = edu/stanford/nlp/models/segmenter/chinese/dict-chris6.se
segment.sighanPostProcessing = true

# sentence split
ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[！？]+
ssplit.boundaryTokenRegex = [.]|[!?！？]+

# pos
pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger
Expand Down
12 changes: 5 additions & 7 deletions src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
Expand Up @@ -47,7 +47,7 @@ public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
Set<String> tokenRegexesToDiscard) {
this(verbose, false,
new WordToSentenceProcessor<>(boundaryTokenRegex,
new WordToSentenceProcessor<>(boundaryTokenRegex, null,
boundaryToDiscard, htmlElementsToDiscard,
WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
(boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard));
Expand All @@ -69,27 +69,25 @@ private WordsToSentencesAnnotator(boolean verbose, boolean countLineNumbers,
* are used in numbering the sentence. Only this constructor leads to
* empty sentences.
*
* @param verbose Whether it is verbose.
* @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
* newline tokens returned from the tokenizer.
* @return A WordsToSentencesAnnotator.
*/
public static WordsToSentencesAnnotator newlineSplitter(boolean verbose, String ... nlToken) {
public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
// this constructor will keep empty lines as empty sentences
WordToSentenceProcessor<CoreLabel> wts =
new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken));
return new WordsToSentencesAnnotator(verbose, true, wts);
return new WordsToSentencesAnnotator(false, true, wts);
}


/** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
*
* @param verbose Whether it is verbose.
* @return A WordsToSentencesAnnotator.
*/
public static WordsToSentencesAnnotator nonSplitter(boolean verbose) {
public static WordsToSentencesAnnotator nonSplitter() {
WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(true);
return new WordsToSentencesAnnotator(verbose, false, wts);
return new WordsToSentencesAnnotator(false, false, wts);
}


Expand Down

0 comments on commit 97f09e2

Please sign in to comment.