Merge the ssplit into the tokenize annotator
AngledLuffa committed Mar 17, 2022
1 parent 5f6df2d commit 55595d3
Showing 4 changed files with 47 additions and 32 deletions.
44 changes: 22 additions & 22 deletions src/edu/stanford/nlp/pipeline/Annotator.java
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
- put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
+ put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
- put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
- put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
- put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
+ put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+ put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+ put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
- put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
- put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
- put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
- put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
- put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
- put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
- put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
- put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
- put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
- put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
+ put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+ put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+ put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+ put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+ put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+ put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
+ put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+ put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+ put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
+ put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
- put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
- put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
- put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
- put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
- put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
- put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
- put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
+ put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+ put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
+ put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
+ put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
+ put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
+ put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
+ put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
// TODO: there are language specific dependencies which we may
// want to encode somehow. For example, English KBP needs coref
// to function. Spanish KBP doesn't need coref, and in fact,
// Spanish coref doesn't even exist.
- put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
+ put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
}};

}
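With STANFORD_SSPLIT dropped from the default requirement lists above, downstream annotators depend only on tokenize plus their other prerequisites, so a pipeline spec no longer needs to mention ssplit. A minimal sketch of what this enables, not taken from the commit; the class name is illustrative and it assumes the standard StanfordCoreNLP properties API:

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class TokenizeOnlyPipeline {
  public static void main(String[] args) {
    // "tokenize,pos,lemma" now satisfies the dependency check without ssplit,
    // since the requirement map above no longer lists ssplit for pos or lemma.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,pos,lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("Stanford is in California. It was founded in 1885.");
    pipeline.annotate(doc);
  }
}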
25 changes: 17 additions & 8 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP

// if cleanxml is requested and tokenize is here,
// make it part of tokenize rather than its own annotator
- unifyCleanXML(this.properties);
+ unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
+ // ssplit is always part of tokenize now
+ unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);

// cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
* In such a case, we remove the cleanxml from the annotators and set
* the tokenize.cleanxml option instead
*/
- static void unifyCleanXML(Properties properties) {
+ static void unifyTokenizeProperty(Properties properties, String property, String option) {
String annotators = properties.getProperty("annotators", "");
int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
- int clean = annotators.indexOf(STANFORD_CLEAN_XML);
+ int unwanted = annotators.indexOf(property);

- if (clean >= 0 && tokenize >= 0) {
- properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
- int comma = annotators.indexOf(",", clean);
+ if (unwanted >= 0 && tokenize >= 0) {
+ if (option != null) {
+ properties.setProperty(option, "true");
+ }
+ int comma = annotators.indexOf(",", unwanted);
if (comma >= 0) {
- annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
+ annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
} else {
comma = annotators.lastIndexOf(",");
if (comma < 0) {
throw new IllegalArgumentException("Unable to process annotators " + annotators);
}
annotators = annotators.substring(0, comma);
}
- logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
+ if (option != null) {
+ logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
+ } else {
+ logger.debug(property + " is now included as part of the tokenize annotator by default");
+ }
+ logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
properties.setProperty("annotators", annotators);
}
}
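The helper, formerly unifyCleanXML, is now generic: the constructor calls it once for cleanxml, folding it into the tokenize.cleanxml option, and once for ssplit with a null option, which simply drops ssplit from the annotators list. A worked sketch of the rewriting, illustrative only; unifyTokenizeProperty is package-private, so this assumes code in edu.stanford.nlp.pipeline, as in the unit test further below:

// Sketch: how the two constructor calls rewrite a typical annotators property.
Properties props = new Properties();
props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos");
StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);
// props now holds annotators = "tokenize,pos" and tokenize.cleanxml = "true"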
8 changes: 7 additions & 1 deletion src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
private final boolean useSegmenter;
private final Annotator segmenterAnnotator;
private final CleanXmlAnnotator cleanxmlAnnotator;
+ private final WordsToSentencesAnnotator ssplitAnnotator;

/** run a custom post processor after the lexer **/
private final List<CoreLabelProcessor> postProcessors;
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
} else {
this.cleanxmlAnnotator = null;
}
+
+ this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
}

/**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
if (this.cleanxmlAnnotator != null) {
this.cleanxmlAnnotator.annotate(annotation);
}
+ this.ssplitAnnotator.annotate(annotation);
}

@Override
@@ -451,7 +455,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
- CoreAnnotations.IsNewlineAnnotation.class
+ CoreAnnotations.IsNewlineAnnotation.class,
+ CoreAnnotations.SentencesAnnotation.class,
+ CoreAnnotations.SentenceIndexAnnotation.class
));
}
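Since annotate() now runs the embedded WordsToSentencesAnnotator and requirementsSatisfied() advertises SentencesAnnotation, a document run through the tokenize annotator alone should already be split into sentences. A rough sketch, not from the commit, with an illustrative class name:

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class TokenizeGivesSentences {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("First sentence. Second sentence.");
    pipeline.annotate(doc);

    // SentencesAnnotation is now filled in by the tokenize annotator itself.
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}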

@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
for (int i = 0; i < inputs.length; ++i) {
Properties props = new Properties();
props.setProperty("annotators", inputs[i]);
- StanfordCoreNLP.unifyCleanXML(props);
+ StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
assertEquals(expected[i], props.getProperty("annotators"));
assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
}
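A companion assertion for the ssplit case could look like the following; this is hypothetical and not part of this commit's test changes. Passing null as the option means ssplit is removed from the annotators list without setting any tokenize.* property:

// Hypothetical check, assuming the same test class and imports as above.
Properties props = new Properties();
props.setProperty("annotators", "tokenize,ssplit,pos");
StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);
assertEquals("tokenize,pos", props.getProperty("annotators"));
assertFalse(PropertiesUtils.getBool(props, "tokenize.ssplit", false));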