Merge the ssplit into the tokenize annotator
AngledLuffa committed Mar 17, 2022
1 parent 5f6df2d commit 55595d3
Showing 4 changed files with 47 additions and 32 deletions.
44 changes: 22 additions & 22 deletions src/edu/stanford/nlp/pipeline/Annotator.java
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
- put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
+ put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
- put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
- put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
- put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
+ put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+ put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+ put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
- put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
- put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
- put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
- put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
- put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
- put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
- put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
- put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
- put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
- put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
+ put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+ put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+ put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+ put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+ put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+ put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
+ put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+ put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+ put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
+ put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
- put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
- put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
- put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
- put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
- put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
- put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
- put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
+ put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+ put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
+ put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
+ put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
+ put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
+ put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
+ put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
// TODO: there are language specific dependencies which we may
// want to encode somehow. For example, English KBP needs coref
// to function. Spanish KBP doesn't need coref, and in fact,
// Spanish coref doesn't even exist.
- put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
+ put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
}};

}
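With STANFORD_SSPLIT dropped from the default requirement lists above, downstream annotators depend only on tokenize plus their other prerequisites, so a pipeline spec no longer needs to mention ssplit. A minimal sketch of what this enables, not taken from the commit; the class name is illustrative and it assumes the standard StanfordCoreNLP properties API:

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class TokenizeOnlyPipeline {
  public static void main(String[] args) {
    // "tokenize,pos,lemma" now satisfies the dependency check without ssplit,
    // since the requirement map above no longer lists ssplit for pos or lemma.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,pos,lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("Stanford is in California. It was founded in 1885.");
    pipeline.annotate(doc);
  }
}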
25 changes: 17 additions & 8 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP

// if cleanxml is requested and tokenize is here,
// make it part of tokenize rather than its own annotator
- unifyCleanXML(this.properties);
+ unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
+ // ssplit is always part of tokenize now
+ unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);

// cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
* In such a case, we remove the cleanxml from the annotators and set
* the tokenize.cleanxml option instead
*/
- static void unifyCleanXML(Properties properties) {
+ static void unifyTokenizeProperty(Properties properties, String property, String option) {
String annotators = properties.getProperty("annotators", "");
int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
- int clean = annotators.indexOf(STANFORD_CLEAN_XML);
+ int unwanted = annotators.indexOf(property);

- if (clean >= 0 && tokenize >= 0) {
- properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
- int comma = annotators.indexOf(",", clean);
+ if (unwanted >= 0 && tokenize >= 0) {
+ if (option != null) {
+ properties.setProperty(option, "true");
+ }
+ int comma = annotators.indexOf(",", unwanted);
if (comma >= 0) {
- annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
+ annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
} else {
comma = annotators.lastIndexOf(",");
if (comma < 0) {
throw new IllegalArgumentException("Unable to process annotators " + annotators);
}
annotators = annotators.substring(0, comma);
}
- logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
+ if (option != null) {
+ logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
+ } else {
+ logger.debug(property + " is now included as part of the tokenize annotator by default");
+ }
+ logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
properties.setProperty("annotators", annotators);
}
}
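The helper, formerly unifyCleanXML, is now generic: the constructor calls it once for cleanxml, folding it into the tokenize.cleanxml option, and once for ssplit with a null option, which simply drops ssplit from the annotators list. A worked sketch of the rewriting, illustrative only; unifyTokenizeProperty is package-private, so this assumes code in edu.stanford.nlp.pipeline, as in the unit test further below:

// Sketch: how the two constructor calls rewrite a typical annotators property.
Properties props = new Properties();
props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos");
StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);
// props now holds annotators = "tokenize,pos" and tokenize.cleanxml = "true"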
8 changes: 7 additions & 1 deletion src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
private final boolean useSegmenter;
private final Annotator segmenterAnnotator;
private final CleanXmlAnnotator cleanxmlAnnotator;
+ private final WordsToSentencesAnnotator ssplitAnnotator;

/** run a custom post processor after the lexer **/
private final List<CoreLabelProcessor> postProcessors;
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
} else {
this.cleanxmlAnnotator = null;
}
+
+ this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
}

/**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
if (this.cleanxmlAnnotator != null) {
this.cleanxmlAnnotator.annotate(annotation);
}
+ this.ssplitAnnotator.annotate(annotation);
}

@Override
@@ -451,7 +455,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
- CoreAnnotations.IsNewlineAnnotation.class
+ CoreAnnotations.IsNewlineAnnotation.class,
+ CoreAnnotations.SentencesAnnotation.class,
+ CoreAnnotations.SentenceIndexAnnotation.class
));
}
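Since annotate() now runs the embedded WordsToSentencesAnnotator and requirementsSatisfied() advertises SentencesAnnotation, a document run through the tokenize annotator alone should already be split into sentences. A rough sketch, not from the commit, with an illustrative class name:

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class TokenizeGivesSentences {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("First sentence. Second sentence.");
    pipeline.annotate(doc);

    // SentencesAnnotation is now filled in by the tokenize annotator itself.
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}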

@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
for (int i = 0; i < inputs.length; ++i) {
Properties props = new Properties();
props.setProperty("annotators", inputs[i]);
- StanfordCoreNLP.unifyCleanXML(props);
+ StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
assertEquals(expected[i], props.getProperty("annotators"));
assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
}
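A companion assertion for the ssplit case could look like the following; this is hypothetical and not part of this commit's test changes. Passing null as the option means ssplit is removed from the annotators list without setting any tokenize.* property:

// Hypothetical check, assuming the same test class and imports as above.
Properties props = new Properties();
props.setProperty("annotators", "tokenize,ssplit,pos");
StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);
assertEquals("tokenize,pos", props.getProperty("annotators"));
assertFalse(PropertiesUtils.getBool(props, "tokenize.ssplit", false));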