Cleanxml #1259

Merged: 5 commits, Mar 20, 2022
28 changes: 19 additions & 9 deletions itest/src/edu/stanford/nlp/pipeline/TokenizerAnnotatorITest.java
@@ -29,28 +29,38 @@ public void testNotSpanish() {
assertEquals("Damelo", ann.get(CoreAnnotations.TokensAnnotation.class).get(0).word());
}

private static final String spanishText = "Me voy a Madrid (ES).\n\"Me gusta\", lo dice.";
private static List<String> spanishTokens = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
private static List<String> spanishTokens2 = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", AbstractTokenizer.NEWLINE_TOKEN, "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
private static final String spanishText = "Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.";
private static final String[] spanishTokens = { "Me", "voy", "a", "Madrid", "(", "ES", ")", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." };

public void testSpanishTokenizer() {
TokenizerAnnotator annotator = new TokenizerAnnotator(false, "es", null);
Properties props = new Properties();
props.setProperty("tokenize.language", "es");

TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
Annotation annotation = new Annotation(spanishText);
annotator.annotate(annotation);
List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
assertEquals(spanishTokens.size(), tokens.size());
assertEquals(spanishTokens.length, tokens.size());
for (int i = 0; i < tokens.size(); ++i) {
assertEquals(spanishTokens.get(i), tokens.get(i).value());
assertEquals(spanishTokens[i], tokens.get(i).value());
}
assertEquals(1, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());

// the difference here with NEWLINE_... = two, tokenizeNLs is on
// and there will be two sentences
// the sentence splitter inside the TokenizerAnnotator will see
// the *NL* and split a second sentence there
props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");

annotator = new TokenizerAnnotator(false, "es", "tokenizeNLs,");
annotator = new TokenizerAnnotator(false, props);
annotation = new Annotation(spanishText);
annotator.annotate(annotation);
tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
assertEquals(spanishTokens2.size(), tokens.size());
assertEquals(spanishTokens.length, tokens.size());
for (int i = 0; i < tokens.size(); ++i) {
assertEquals(spanishTokens2.get(i), tokens.get(i).value());
assertEquals(spanishTokens[i], tokens.get(i).value());
}
assertEquals(2, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
}

}
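
For context, the updated test drives the tokenizer entirely through Properties rather than the old (verbose, lang, options) constructor. A minimal standalone sketch of the same configuration (the class name SpanishTokenizeSketch is hypothetical; it assumes the public NEWLINE_IS_SENTENCE_BREAK_PROPERTY constant and the TokenizerAnnotator(boolean, Properties) constructor shown in this PR):

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;

public class SpanishTokenizeSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("tokenize.language", "es");
    // treat a blank line (two consecutive newlines) as a sentence break,
    // mirroring the second half of the updated test
    props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation annotation = new Annotation("Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.");
    annotator.annotate(annotation);

    // ssplit now runs inside tokenize, so this single annotator also populates sentences
    System.out.println(annotation.get(CoreAnnotations.SentencesAnnotation.class).size());  // expected: 2
  }
}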
44 changes: 22 additions & 22 deletions src/edu/stanford/nlp/pipeline/Annotator.java
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
// TODO: there are language specific dependencies which we may
// want to encode somehow. For example, English KBP needs coref
// to function. Spanish KBP doesn't need coref, and in fact,
// Spanish coref doesn't even exist.
put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
}};

}
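
The net effect of dropping STANFORD_SSPLIT from these requirement sets is that downstream annotators now only require tokenize, which produces sentence annotations itself. A hedged sketch of a pipeline definition that should satisfy the new requirements without listing ssplit (hypothetical class name; assumes the default English models are on the classpath):

import java.util.Properties;

import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class NoExplicitSsplitSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    // ssplit is no longer a declared prerequisite of pos/lemma/ner
    props.setProperty("annotators", "tokenize,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    CoreDocument doc = new CoreDocument("Stanford is in California. It rains there in winter.");
    pipeline.annotate(doc);
    System.out.println(doc.sentences().size());  // expected: 2
  }
}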
64 changes: 64 additions & 0 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -255,6 +255,8 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
this.properties.setProperty("annotators", newAnnotators);
}

normalizeAnnotators(this.properties);

// cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());

@@ -303,6 +305,68 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
this.pipelineSetupTime = tim.report();
}

/**
* update the annotators, hopefully in a backwards compatible manner
*/
static void normalizeAnnotators(Properties properties) {
// if cleanxml is requested and tokenize is here,
// make it part of tokenize rather than its own annotator
unifyTokenizeProperty(properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
// ssplit is always part of tokenize now
unifyTokenizeProperty(properties, STANFORD_SSPLIT, null);
// cdc_tokenize is also absorbed into tokenize
replaceAnnotator(properties, STANFORD_CDC_TOKENIZE, STANFORD_TOKENIZE);
}

/**
* The cdc_tokenize annotator is now part of tokenize
*/
static void replaceAnnotator(Properties properties, String oldAnnotator, String newAnnotator) {
String annotators = properties.getProperty("annotators", "");
String replaced = annotators.replace(oldAnnotator, newAnnotator);
if (!replaced.equals(annotators)) {
logger.debug("|" + oldAnnotator + "| is now part of |" + newAnnotator + "|. Annotators updated to |" + replaced + "|");
properties.setProperty("annotators", replaced);
}
}

/**
* The cleanxml annotator can now be invoked as part of the tokenize annotator.
*<br>
* To ensure backwards compatibility with previous usage of the pipeline,
* we allow annotators to be specified tokenize,cleanxml.
* In such a case, we remove the cleanxml from the annotators and set
* the tokenize.cleanxml option instead
*/
static void unifyTokenizeProperty(Properties properties, String property, String option) {
String annotators = properties.getProperty("annotators", "");
int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
int unwanted = annotators.indexOf(property);

if (unwanted >= 0 && tokenize >= 0) {
if (option != null) {
properties.setProperty(option, "true");
}
int comma = annotators.indexOf(",", unwanted);
if (comma >= 0) {
annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
} else {
comma = annotators.lastIndexOf(",");
if (comma < 0) {
throw new IllegalArgumentException("Unable to process annotators " + annotators);
}
annotators = annotators.substring(0, comma);
}
if (option != null) {
logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
} else {
logger.debug(property + " is now included as part of the tokenize annotator by default");
}
logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
properties.setProperty("annotators", annotators);
}
}

//
// @Override-able methods to change pipeline behavior
//
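
To see what normalizeAnnotators and unifyTokenizeProperty do to a legacy annotator list, here is a small sketch. Both methods are package-private, so this only runs from inside edu.stanford.nlp.pipeline (for example as a unit test); the class name is hypothetical and the expected values follow from the string manipulation above:

package edu.stanford.nlp.pipeline;

import java.util.Properties;

public class NormalizeAnnotatorsSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos,lemma");

    StanfordCoreNLP.normalizeAnnotators(props);

    // cleanxml becomes a tokenize option and ssplit is removed from the list
    System.out.println(props.getProperty("annotators"));         // tokenize,pos,lemma
    System.out.println(props.getProperty("tokenize.cleanxml"));  // true
  }
}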
1 change: 1 addition & 0 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
@@ -472,6 +472,7 @@ private Properties getProperties(HttpExchange httpExchange) throws UnsupportedEn
urlProperties.forEach(props::setProperty);

// Get the annotators
StanfordCoreNLP.normalizeAnnotators(props);
String annotators = props.getProperty("annotators");
// If the properties contains a custom annotator, then do not enforceRequirements.
if (annotators != null && !PropertiesUtils.hasPropertyPrefix(props, CUSTOM_ANNOTATOR_PREFIX) && PropertiesUtils.getBool(props, "enforceRequirements", true)) {
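
Because the server now runs the same normalization on request properties, a client that still lists cleanxml and ssplit as separate annotators should keep working. A hedged sketch of such a request against a locally running server (default port 9000; hypothetical class name; assumes the server's documented properties URL parameter):

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class LegacyAnnotatorsRequestSketch {
  public static void main(String[] args) throws Exception {
    // legacy-style annotator list; the server normalizes it before enforcing requirements
    String properties = "{\"annotators\":\"tokenize,cleanxml,ssplit,pos\",\"outputFormat\":\"json\"}";
    URI uri = URI.create("http://localhost:9000/?properties="
        + URLEncoder.encode(properties, StandardCharsets.UTF_8));

    HttpRequest request = HttpRequest.newBuilder(uri)
        .POST(HttpRequest.BodyPublishers.ofString("Stanford sent an <xml>email</xml> today."))
        .build();

    HttpResponse<String> response = HttpClient.newHttpClient()
        .send(request, HttpResponse.BodyHandlers.ofString());
    System.out.println(response.body());
  }
}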
62 changes: 43 additions & 19 deletions src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -131,10 +131,13 @@ public static TokenizerType getTokenizerType(Properties props) {
private final TokenizerFactory<CoreLabel> factory;

/** new segmenter properties **/
private final boolean useSegmenter;
private final Annotator segmenterAnnotator;
/** If not null, will use this instead of a lexer or segmenter */
private final StatTokSentAnnotator cdcAnnotator;
private final CleanXmlAnnotator cleanxmlAnnotator;
private final WordsToSentencesAnnotator ssplitAnnotator;

/** run a custom post processor after the lexer **/
/** run a custom post processor after the lexer. DOES NOT apply to segmenters **/
private final List<CoreLabelProcessor> postProcessors;

// CONSTRUCTORS
@@ -195,7 +198,7 @@ public TokenizerAnnotator(boolean verbose, String lang, String options) {
}

public TokenizerAnnotator(boolean verbose, Properties props) {
this(verbose, props, null);
this(verbose, props, computeExtraOptions(props));
}

public TokenizerAnnotator(boolean verbose, Properties props, String options) {
@@ -205,23 +208,24 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
// check if segmenting must be done (Chinese or Arabic and not tokenizing on whitespace)
boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
if (props.getProperty("tokenize.language") != null &&
LanguageInfo.isSegmenterLanguage(props.getProperty("tokenize.language"))
&& !whitespace) {
useSegmenter = true;
if (LanguageInfo.getLanguageFromString(
props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.ARABIC)
LanguageInfo.isSegmenterLanguage(props.getProperty("tokenize.language")) &&
!whitespace) {
cdcAnnotator = null;
if (LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.ARABIC) {
segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
else if (LanguageInfo.getLanguageFromString(
props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.CHINESE)
} else if (LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.CHINESE) {
segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
else {
} else {
segmenterAnnotator = null;
throw new RuntimeException("No segmenter implemented for: "+
LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")));
LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")));
}
} else if (props.getProperty(STANFORD_CDC_TOKENIZE + ".model", null) != null) {
cdcAnnotator = new StatTokSentAnnotator(props);
segmenterAnnotator = null;
} else {
useSegmenter = false;
segmenterAnnotator = null;
cdcAnnotator = null;
}

// load any custom token post processing
Expand All @@ -245,6 +249,14 @@ else if (LanguageInfo.getLanguageFromString(
if (VERBOSE) {
log.info("Initialized tokenizer factory: " + factory);
}

if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML)) {
this.cleanxmlAnnotator = new CleanXmlAnnotator(props);
} else {
this.cleanxmlAnnotator = null;
}

this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
}

/**
@@ -374,16 +386,22 @@ public void annotate(Annotation annotation) {
log.info("Beginning tokenization");
}

if (cdcAnnotator != null) {
cdcAnnotator.annotate(annotation);
// the CDC annotator does tokenize, ssplit, and mwt (if we even
// integrate that into tokenize), so we just leave once it's
// done. the unique internal workings of that tokenizer prevent
// cleanxml from working, at least for now
return;
}

// for Arabic and Chinese use a segmenter instead
if (useSegmenter) {
if (segmenterAnnotator != null) {
segmenterAnnotator.annotate(annotation);
// set indexes into document wide tokens list
setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
setNewlineStatus(annotation.get(CoreAnnotations.TokensAnnotation.class));
return;
}

if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
} else if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
// TODO: This is a huge hack. jflex does not have a lookahead operation which can match EOF
// Because of this, the PTBTokenizer has a few productions which can't operate at EOF.
// For example,
@@ -424,6 +442,10 @@ public void annotate(Annotation annotation) {
throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
}

if (this.cleanxmlAnnotator != null) {
this.cleanxmlAnnotator.annotate(annotation);
}
this.ssplitAnnotator.annotate(annotation);
}

@Override
Expand All @@ -446,7 +468,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
CoreAnnotations.IsNewlineAnnotation.class
CoreAnnotations.IsNewlineAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.SentenceIndexAnnotation.class
));
}

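
Finally, the tokenize.cleanxml option wired up above can be exercised directly on a TokenizerAnnotator. A minimal sketch (hypothetical class name; assumes the default English tokenizer and the CleanXmlAnnotator(Properties) constructor used in this PR):

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;

public class TokenizeCleanXmlSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("tokenize.cleanxml", "true");  // enables the embedded CleanXmlAnnotator

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation ann = new Annotation("<doc><p>Stanford is in California.</p></doc>");
    annotator.annotate(ann);

    // cleanxml removes the tag tokens, then the embedded ssplit runs,
    // so both tokens and sentences are available from this one annotator
    System.out.println(ann.get(CoreAnnotations.TokensAnnotation.class).size());
    System.out.println(ann.get(CoreAnnotations.SentencesAnnotation.class).size());  // expected: 1
  }
}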