Cleanxml #1259

Merged: 5 commits, Mar 20, 2022
28 changes: 19 additions & 9 deletions itest/src/edu/stanford/nlp/pipeline/TokenizerAnnotatorITest.java
@@ -29,28 +29,38 @@ public void testNotSpanish() {
assertEquals("Damelo", ann.get(CoreAnnotations.TokensAnnotation.class).get(0).word());
}

private static final String spanishText = "Me voy a Madrid (ES).\n\"Me gusta\", lo dice.";
private static List<String> spanishTokens = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
private static List<String> spanishTokens2 = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", AbstractTokenizer.NEWLINE_TOKEN, "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
private static final String spanishText = "Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.";
private static final String[] spanishTokens = { "Me", "voy", "a", "Madrid", "(", "ES", ")", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." };

public void testSpanishTokenizer() {
TokenizerAnnotator annotator = new TokenizerAnnotator(false, "es", null);
Properties props = new Properties();
props.setProperty("tokenize.language", "es");

TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
Annotation annotation = new Annotation(spanishText);
annotator.annotate(annotation);
List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
assertEquals(spanishTokens.size(), tokens.size());
assertEquals(spanishTokens.length, tokens.size());
for (int i = 0; i < tokens.size(); ++i) {
assertEquals(spanishTokens.get(i), tokens.get(i).value());
assertEquals(spanishTokens[i], tokens.get(i).value());
}
assertEquals(1, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());

// the difference here with NEWLINE_... = two, tokenizeNLs is on
// and there will be two sentences
// the sentence splitter inside the TokenizerAnnotator will see
// the *NL* and split a second sentence there
props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");

annotator = new TokenizerAnnotator(false, "es", "tokenizeNLs,");
annotator = new TokenizerAnnotator(false, props);
annotation = new Annotation(spanishText);
annotator.annotate(annotation);
tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
assertEquals(spanishTokens2.size(), tokens.size());
assertEquals(spanishTokens.length, tokens.size());
for (int i = 0; i < tokens.size(); ++i) {
assertEquals(spanishTokens2.get(i), tokens.get(i).value());
assertEquals(spanishTokens[i], tokens.get(i).value());
}
assertEquals(2, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
}

}
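
For context, the updated test drives the tokenizer entirely through Properties rather than the old (verbose, lang, options) constructor. A minimal standalone sketch of the same configuration (the class name SpanishTokenizeSketch is hypothetical; it assumes the public NEWLINE_IS_SENTENCE_BREAK_PROPERTY constant and the TokenizerAnnotator(boolean, Properties) constructor shown in this PR):

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;

public class SpanishTokenizeSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("tokenize.language", "es");
    // treat a blank line (two consecutive newlines) as a sentence break,
    // mirroring the second half of the updated test
    props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation annotation = new Annotation("Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.");
    annotator.annotate(annotation);

    // ssplit now runs inside tokenize, so this single annotator also populates sentences
    System.out.println(annotation.get(CoreAnnotations.SentencesAnnotation.class).size());  // expected: 2
  }
}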
44 changes: 22 additions & 22 deletions src/edu/stanford/nlp/pipeline/Annotator.java
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
// TODO: there are language specific dependencies which we may
// want to encode somehow. For example, English KBP needs coref
// to function. Spanish KBP doesn't need coref, and in fact,
// Spanish coref doesn't even exist.
put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
}};

}
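
The net effect of dropping STANFORD_SSPLIT from these requirement sets is that downstream annotators now only require tokenize, which produces sentence annotations itself. A hedged sketch of a pipeline definition that should satisfy the new requirements without listing ssplit (hypothetical class name; assumes the default English models are on the classpath):

import java.util.Properties;

import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class NoExplicitSsplitSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    // ssplit is no longer a declared prerequisite of pos/lemma/ner
    props.setProperty("annotators", "tokenize,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    CoreDocument doc = new CoreDocument("Stanford is in California. It rains there in winter.");
    pipeline.annotate(doc);
    System.out.println(doc.sentences().size());  // expected: 2
  }
}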
64 changes: 64 additions & 0 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -255,6 +255,8 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
this.properties.setProperty("annotators", newAnnotators);
}

normalizeAnnotators(this.properties);

// cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());

@@ -303,6 +305,68 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
this.pipelineSetupTime = tim.report();
}

/**
* update the annotators, hopefully in a backwards compatible manner
*/
static void normalizeAnnotators(Properties properties) {
// if cleanxml is requested and tokenize is here,
// make it part of tokenize rather than its own annotator
unifyTokenizeProperty(properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
// ssplit is always part of tokenize now
unifyTokenizeProperty(properties, STANFORD_SSPLIT, null);
// cdc_tokenize is also absorbed into tokenize
replaceAnnotator(properties, STANFORD_CDC_TOKENIZE, STANFORD_TOKENIZE);
}

/**
* The cdc_tokenize annotator is now part of tokenize
*/
static void replaceAnnotator(Properties properties, String oldAnnotator, String newAnnotator) {
String annotators = properties.getProperty("annotators", "");
String replaced = annotators.replace(oldAnnotator, newAnnotator);
if (!replaced.equals(annotators)) {
logger.debug("|" + oldAnnotator + "| is now part of |" + newAnnotator + "|. Annotators updated to |" + replaced + "|");
properties.setProperty("annotators", replaced);
}
}

/**
* The cleanxml annotator can now be invoked as part of the tokenize annotator.
*<br>
* To ensure backwards compatibility with previous usage of the pipeline,
* we allow annotators to be specified tokenize,cleanxml.
* In such a case, we remove the cleanxml from the annotators and set
* the tokenize.cleanxml option instead
*/
static void unifyTokenizeProperty(Properties properties, String property, String option) {
String annotators = properties.getProperty("annotators", "");
int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
int unwanted = annotators.indexOf(property);

if (unwanted >= 0 && tokenize >= 0) {
if (option != null) {
properties.setProperty(option, "true");
}
int comma = annotators.indexOf(",", unwanted);
if (comma >= 0) {
annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
} else {
comma = annotators.lastIndexOf(",");
if (comma < 0) {
throw new IllegalArgumentException("Unable to process annotators " + annotators);
}
annotators = annotators.substring(0, comma);
}
if (option != null) {
logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
} else {
logger.debug(property + " is now included as part of the tokenize annotator by default");
}
logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
properties.setProperty("annotators", annotators);
}
}

//
// @Override-able methods to change pipeline behavior
//
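
To see what normalizeAnnotators and unifyTokenizeProperty do to a legacy annotator list, here is a small sketch. Both methods are package-private, so this only runs from inside edu.stanford.nlp.pipeline (for example as a unit test); the class name is hypothetical and the expected values follow from the string manipulation above:

package edu.stanford.nlp.pipeline;

import java.util.Properties;

public class NormalizeAnnotatorsSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos,lemma");

    StanfordCoreNLP.normalizeAnnotators(props);

    // cleanxml becomes a tokenize option and ssplit is removed from the list
    System.out.println(props.getProperty("annotators"));         // tokenize,pos,lemma
    System.out.println(props.getProperty("tokenize.cleanxml"));  // true
  }
}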
1 change: 1 addition & 0 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
@@ -472,6 +472,7 @@ private Properties getProperties(HttpExchange httpExchange) throws UnsupportedEn
urlProperties.forEach(props::setProperty);

// Get the annotators
StanfordCoreNLP.normalizeAnnotators(props);
String annotators = props.getProperty("annotators");
// If the properties contains a custom annotator, then do not enforceRequirements.
if (annotators != null && !PropertiesUtils.hasPropertyPrefix(props, CUSTOM_ANNOTATOR_PREFIX) && PropertiesUtils.getBool(props, "enforceRequirements", true)) {
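
Because the server now runs the same normalization on request properties, a client that still lists cleanxml and ssplit as separate annotators should keep working. A hedged sketch of such a request against a locally running server (default port 9000; hypothetical class name; assumes the server's documented properties URL parameter):

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class LegacyAnnotatorsRequestSketch {
  public static void main(String[] args) throws Exception {
    // legacy-style annotator list; the server normalizes it before enforcing requirements
    String properties = "{\"annotators\":\"tokenize,cleanxml,ssplit,pos\",\"outputFormat\":\"json\"}";
    URI uri = URI.create("http://localhost:9000/?properties="
        + URLEncoder.encode(properties, StandardCharsets.UTF_8));

    HttpRequest request = HttpRequest.newBuilder(uri)
        .POST(HttpRequest.BodyPublishers.ofString("Stanford sent an <xml>email</xml> today."))
        .build();

    HttpResponse<String> response = HttpClient.newHttpClient()
        .send(request, HttpResponse.BodyHandlers.ofString());
    System.out.println(response.body());
  }
}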
62 changes: 43 additions & 19 deletions src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -131,10 +131,13 @@ public static TokenizerType getTokenizerType(Properties props) {
private final TokenizerFactory<CoreLabel> factory;

/** new segmenter properties **/
private final boolean useSegmenter;
private final Annotator segmenterAnnotator;
/** If not null, will use this instead of a lexer or segmenter */
private final StatTokSentAnnotator cdcAnnotator;
private final CleanXmlAnnotator cleanxmlAnnotator;
private final WordsToSentencesAnnotator ssplitAnnotator;

/** run a custom post processor after the lexer **/
/** run a custom post processor after the lexer. DOES NOT apply to segmenters **/
private final List<CoreLabelProcessor> postProcessors;

// CONSTRUCTORS
@@ -195,7 +198,7 @@ public TokenizerAnnotator(boolean verbose, String lang, String options) {
}

public TokenizerAnnotator(boolean verbose, Properties props) {
this(verbose, props, null);
this(verbose, props, computeExtraOptions(props));
}

public TokenizerAnnotator(boolean verbose, Properties props, String options) {
@@ -205,23 +208,24 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
// check if segmenting must be done (Chinese or Arabic and not tokenizing on whitespace)
boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
if (props.getProperty("tokenize.language") != null &&
LanguageInfo.isSegmenterLanguage(props.getProperty("tokenize.language"))
&& !whitespace) {
useSegmenter = true;
if (LanguageInfo.getLanguageFromString(
props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.ARABIC)
LanguageInfo.isSegmenterLanguage(props.getProperty("tokenize.language")) &&
!whitespace) {
cdcAnnotator = null;
if (LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.ARABIC) {
segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
else if (LanguageInfo.getLanguageFromString(
props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.CHINESE)
} else if (LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.CHINESE) {
segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
else {
} else {
segmenterAnnotator = null;
throw new RuntimeException("No segmenter implemented for: "+
LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")));
LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")));
}
} else if (props.getProperty(STANFORD_CDC_TOKENIZE + ".model", null) != null) {
cdcAnnotator = new StatTokSentAnnotator(props);
segmenterAnnotator = null;
} else {
useSegmenter = false;
segmenterAnnotator = null;
cdcAnnotator = null;
}

// load any custom token post processing
Expand All @@ -245,6 +249,14 @@ else if (LanguageInfo.getLanguageFromString(
if (VERBOSE) {
log.info("Initialized tokenizer factory: " + factory);
}

if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML)) {
this.cleanxmlAnnotator = new CleanXmlAnnotator(props);
} else {
this.cleanxmlAnnotator = null;
}

this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
}

/**
@@ -374,16 +386,22 @@ public void annotate(Annotation annotation) {
log.info("Beginning tokenization");
}

if (cdcAnnotator != null) {
cdcAnnotator.annotate(annotation);
// the CDC annotator does tokenize, ssplit, and mwt (if we even
// integrate that into tokenize), so we just leave once it's
// done. the unique internal workings of that tokenizer prevent
// cleanxml from working, at least for now
return;
}

// for Arabic and Chinese use a segmenter instead
if (useSegmenter) {
if (segmenterAnnotator != null) {
segmenterAnnotator.annotate(annotation);
// set indexes into document wide tokens list
setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
setNewlineStatus(annotation.get(CoreAnnotations.TokensAnnotation.class));
return;
}

if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
} else if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
// TODO: This is a huge hack. jflex does not have a lookahead operation which can match EOF
// Because of this, the PTBTokenizer has a few productions which can't operate at EOF.
// For example,
@@ -424,6 +442,10 @@ public void annotate(Annotation annotation) {
throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
}

if (this.cleanxmlAnnotator != null) {
this.cleanxmlAnnotator.annotate(annotation);
}
this.ssplitAnnotator.annotate(annotation);
}

@Override
Expand All @@ -446,7 +468,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
CoreAnnotations.IsNewlineAnnotation.class
CoreAnnotations.IsNewlineAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.SentenceIndexAnnotation.class
));
}

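
Finally, the tokenize.cleanxml option wired up above can be exercised directly on a TokenizerAnnotator. A minimal sketch (hypothetical class name; assumes the default English tokenizer and the CleanXmlAnnotator(Properties) constructor used in this PR):

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;

public class TokenizeCleanXmlSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("tokenize.cleanxml", "true");  // enables the embedded CleanXmlAnnotator

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation ann = new Annotation("<doc><p>Stanford is in California.</p></doc>");
    annotator.annotate(ann);

    // cleanxml removes the tag tokens, then the embedded ssplit runs,
    // so both tokens and sentences are available from this one annotator
    System.out.println(ann.get(CoreAnnotations.TokensAnnotation.class).size());
    System.out.println(ann.get(CoreAnnotations.SentencesAnnotation.class).size());  // expected: 1
  }
}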