Improve cross-lingual support of WordsToSentencesProcessor. Closes #314…

…. - use a decent generic Unicode regex for sentence boundary followers - clean up the factory/constructor methods - add to Javadoc - add test on Chinese
stanfordnlp · Dec 3, 2016 · 97f09e2 · 97f09e2
1 parent 28eee7f
commit 97f09e2
Show file tree

Hide file tree

Showing 5 changed files with 141 additions and 136 deletions.
diff --git a/src/edu/stanford/nlp/pipeline/AnnotatorFactories.java b/src/edu/stanford/nlp/pipeline/AnnotatorFactories.java
@@ -161,21 +161,21 @@ public Annotator create() {
           boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"));
           if (whitespaceTokenization) {
             if (System.lineSeparator().equals("\n")) {
-              return WordsToSentencesAnnotator.newlineSplitter(false, "\n");
+              return WordsToSentencesAnnotator.newlineSplitter("\n");
             } else {
               // throw "\n" in just in case files use that instead of
               // the system separator
-              return WordsToSentencesAnnotator.newlineSplitter(false, System.lineSeparator(), "\n");
+              return WordsToSentencesAnnotator.newlineSplitter(System.lineSeparator(), "\n");
             }
           } else {
-            return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken());
+            return WordsToSentencesAnnotator.newlineSplitter(PTBTokenizer.getNewlineToken());
           }
 
         } else {
           // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
           String isOneSentence = properties.getProperty("ssplit.isOneSentence");
           if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false
-            return WordsToSentencesAnnotator.nonSplitter(false);
+            return WordsToSentencesAnnotator.nonSplitter();
           }
 
           // multi token sentence boundaries
@@ -192,6 +192,8 @@ public Annotator create() {
           String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
           Set<String> boundariesToDiscard = null;
 
+          // todo [cdm 2016]: Add support for specifying ssplit.boundaryFollowerRegex here and send down to WordsToSentencesAnnotator
+
           // newline boundaries which are discarded.
           String bounds = properties.getProperty("ssplit.boundariesToDiscard");
           if (bounds != null) {

diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties
@@ -9,7 +9,7 @@ segment.serDictionary = edu/stanford/nlp/models/segmenter/chinese/dict-chris6.se
 segment.sighanPostProcessing = true
 
 # sentence split
-ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[！？]+
+ssplit.boundaryTokenRegex = [.。]|[!?！？]+
 
 # pos
 pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger

diff --git a/src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java b/src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
@@ -47,7 +47,7 @@ public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
                                    String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
                                    Set<String> tokenRegexesToDiscard) {
     this(verbose, false,
-            new WordToSentenceProcessor<>(boundaryTokenRegex,
+            new WordToSentenceProcessor<>(boundaryTokenRegex, null,
                     boundaryToDiscard, htmlElementsToDiscard,
                     WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
                     (boundaryMultiTokenRegex != null) ? TokenSequencePattern.compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard));
@@ -69,27 +69,25 @@ private WordsToSentencesAnnotator(boolean verbose, boolean countLineNumbers,
    *  are used in numbering the sentence. Only this constructor leads to
    *  empty sentences.
    *
-   *  @param verbose Whether it is verbose.
    *  @param  nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
    *                 newline tokens returned from the tokenizer.
    *  @return A WordsToSentencesAnnotator.
    */
-  public static WordsToSentencesAnnotator newlineSplitter(boolean verbose, String ... nlToken) {
+  public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
     // this constructor will keep empty lines as empty sentences
     WordToSentenceProcessor<CoreLabel> wts =
             new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken));
-    return new WordsToSentencesAnnotator(verbose, true, wts);
+    return new WordsToSentencesAnnotator(false, true, wts);
   }
 
 
   /** Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
    *
-   *  @param verbose Whether it is verbose.
    *  @return A WordsToSentencesAnnotator.
    */
-  public static WordsToSentencesAnnotator nonSplitter(boolean verbose) {
+  public static WordsToSentencesAnnotator nonSplitter() {
     WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(true);
-    return new WordsToSentencesAnnotator(verbose, false, wts);
+    return new WordsToSentencesAnnotator(false, false, wts);
   }