Skip to content

Commit

Permalink
Improve cross-lingual support of WordsToSentencesProcessor. Closes #314
Browse files Browse the repository at this point in the history
…. - use a decent generic Unicode regex for sentence boundary followers - clean up the factory/constructor methods - add to Javadoc - add test on Chinese
  • Loading branch information
manning authored and Stanford NLP committed Dec 3, 2016
1 parent 28eee7f commit 97f09e2
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 136 deletions.
10 changes: 6 additions & 4 deletions src/edu/stanford/nlp/pipeline/AnnotatorFactories.java
Expand Up @@ -161,21 +161,21 @@ public Annotator create() {
boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false")); boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"));
if (whitespaceTokenization) { if (whitespaceTokenization) {
if (System.lineSeparator().equals("\n")) { if (System.lineSeparator().equals("\n")) {
return WordsToSentencesAnnotator.newlineSplitter(false, "\n"); return WordsToSentencesAnnotator.newlineSplitter("\n");
} else { } else {
// throw "\n" in just in case files use that instead of // throw "\n" in just in case files use that instead of
// the system separator // the system separator
return WordsToSentencesAnnotator.newlineSplitter(false, System.lineSeparator(), "\n"); return WordsToSentencesAnnotator.newlineSplitter(System.lineSeparator(), "\n");
} }
} else { } else {
return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken()); return WordsToSentencesAnnotator.newlineSplitter(PTBTokenizer.getNewlineToken());
} }


} else { } else {
// Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence. // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
String isOneSentence = properties.getProperty("ssplit.isOneSentence"); String isOneSentence = properties.getProperty("ssplit.isOneSentence");
if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false
return WordsToSentencesAnnotator.nonSplitter(false); return WordsToSentencesAnnotator.nonSplitter();
} }


// multi token sentence boundaries // multi token sentence boundaries
Expand All @@ -192,6 +192,8 @@ public Annotator create() {
String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex"); String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
Set<String> boundariesToDiscard = null; Set<String> boundariesToDiscard = null;


// todo [cdm 2016]: Add support for specifying ssplit.boundaryFollowerRegex here and send down to WordsToSentencesAnnotator

// newline boundaries which are discarded. // newline boundaries which are discarded.
String bounds = properties.getProperty("ssplit.boundariesToDiscard"); String bounds = properties.getProperty("ssplit.boundariesToDiscard");
if (bounds != null) { if (bounds != null) {
Expand Down
Expand Up @@ -9,7 +9,7 @@ segment.serDictionary = edu/stanford/nlp/models/segmenter/chinese/dict-chris6.se
segment.sighanPostProcessing = true segment.sighanPostProcessing = true


# sentence split # sentence split
ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[!?]+ ssplit.boundaryTokenRegex = [.]|[!?!?]+


# pos # pos
pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger
Expand Down
12 changes: 5 additions & 7 deletions src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
Expand Up @@ -47,7 +47,7 @@ public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
String newlineIsSentenceBreak, String boundaryMultiTokenRegex, String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
Set<String> tokenRegexesToDiscard) { Set<String> tokenRegexesToDiscard) {
this(verbose, false, this(verbose, false,
new WordToSentenceProcessor<>(boundaryTokenRegex, new WordToSentenceProcessor<>(boundaryTokenRegex, null,
boundaryToDiscard, htmlElementsToDiscard, boundaryToDiscard, htmlElementsToDiscard,
WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak), WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
(boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard)); (boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard));
Expand All @@ -69,27 +69,25 @@ private WordsToSentencesAnnotator(boolean verbose, boolean countLineNumbers,
* are used in numbering the sentence. Only this constructor leads to * are used in numbering the sentence. Only this constructor leads to
* empty sentences. * empty sentences.
* *
* @param verbose Whether it is verbose.
* @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake * @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
* newline tokens returned from the tokenizer. * newline tokens returned from the tokenizer.
* @return A WordsToSentenceAnnotator. * @return A WordsToSentenceAnnotator.
*/ */
public static WordsToSentencesAnnotator newlineSplitter(boolean verbose, String ... nlToken) { public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
// this constructor will keep empty lines as empty sentences // this constructor will keep empty lines as empty sentences
WordToSentenceProcessor<CoreLabel> wts = WordToSentenceProcessor<CoreLabel> wts =
new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken)); new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken));
return new WordsToSentencesAnnotator(verbose, true, wts); return new WordsToSentencesAnnotator(false, true, wts);
} }




/** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence. /** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
* *
* @param verbose Whether it is verbose.
* @return A WordsToSentenceAnnotator. * @return A WordsToSentenceAnnotator.
*/ */
public static WordsToSentencesAnnotator nonSplitter(boolean verbose) { public static WordsToSentencesAnnotator nonSplitter() {
WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(true); WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(true);
return new WordsToSentencesAnnotator(verbose, false, wts); return new WordsToSentencesAnnotator(false, false, wts);
} }




Expand Down

0 comments on commit 97f09e2

Please sign in to comment.