Merge branch 'master' of ssh://origin
AngledLuffa authored and Stanford NLP committed Jun 24, 2019
1 parent 737422c commit 8eaefe6
Showing 15 changed files with 313 additions and 82 deletions.
4 changes: 3 additions & 1 deletion build.xml
@@ -160,7 +160,8 @@
<target name="itest" depends="classpath,compile"
description="Run core integration tests">
<echo message="${ant.project.name}" />
<junit fork="yes" maxmemory="12g" printsummary="off" outputtoformatters="false" forkmode="perTest" haltonfailure="true">
<junit fork="yes" maxmemory="12g" printsummary="off" outputtoformatters="false" forkmode="perTest" haltonfailure="no"
haltonerror="no" failureproperty="test.failed" errorproperty="test.failed">
<classpath refid="classpath"/>
<classpath path="${build.path}"/>
<classpath path="${data.path}"/>
@@ -173,6 +174,7 @@
</fileset>
</batchtest>
</junit>
<fail message="Test failure detected, check test results." if="test.failed" />
</target>

<target name="itest-many-docs" depends="classpath"
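The change above stops the integration-test target from halting at the first failing test: with haltonfailure and haltonerror set to "no", every forked test batch runs, any failure or error merely sets the test.failed property, and the new <fail> task reports once at the end. In plain Java, the same collect-then-fail pattern looks roughly like this (the Runnables are illustrative stand-ins for the forked JUnit batches):

import java.util.Arrays;
import java.util.List;

public class CollectThenFail {
  public static void main(String[] args) {
    List<Runnable> tests = Arrays.asList(
        () -> { /* passes */ },
        () -> { throw new AssertionError("boom"); },
        () -> { /* still runs, despite the failure above */ });
    boolean testFailed = false;  // plays the role of failureproperty="test.failed"
    for (Runnable test : tests) {
      try {
        test.run();
      } catch (AssertionError e) {
        testFailed = true;       // record the failure, keep running the rest
      }
    }
    if (testFailed) {            // plays the role of <fail ... if="test.failed"/>
      throw new AssertionError("Test failure detected, check test results.");
    }
  }
}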
20 changes: 14 additions & 6 deletions itest/src/edu/stanford/nlp/naturalli/OpenIEITest.java
@@ -5,12 +5,14 @@
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Sets;
import org.junit.Ignore;
import org.junit.Test;

import java.util.*;

import java.util.stream.Collectors;

import static org.junit.Assert.*;

/**
@@ -53,11 +55,12 @@ public void assertExtracted(String expected, String text) {

public void assertExtracted(Set<String> expectedSet, String text) {
Collection<RelationTriple> extractions = annotate(text).get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
String actual = StringUtils.join(
extractions.stream().map(x -> x.toString().substring(x.toString().indexOf("\t") + 1).toLowerCase()).sorted(),
"\n");
String expected = StringUtils.join(expectedSet.stream().map(String::toLowerCase).sorted(), "\n");
assertEquals(expected, actual);
Set<String> actual =
extractions.stream().map(x -> x.toString().substring(x.toString().indexOf("\t") + 1).toLowerCase())
.collect(Collectors.toSet());
Set<String> expected = expectedSet.stream().map(String::toLowerCase).collect(Collectors.toSet());
Sets.assertEquals(expected, actual, "expected", "actual", true,
() -> "Unexpected results processing " + text);
}

public void assertEntailed(String expected, String text) {
@@ -234,6 +237,11 @@ public void testChessIsNotAPhysicalSport() {
add("Chess\tis\tphysical sport");
add("Chess\tis\tsport");
}}, "Chess is a physical sport");
// TODO: this is failing either because a potential triple is
// being extracted in RelationTripleSegmenter when it shouldn't be,
// or because the polarity check that runs after that extraction
// registers the polarity as "up" and so fails to encode the
// negation
assertExtracted(new HashSet<String>() {{
}}, "Chess is not a physical sport");
}
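The rewritten assertExtracted above compares extractions as sets via Sets.assertEquals, so a mismatch reports which triples are missing or unexpected instead of producing one big sorted-string diff. A minimal sketch of that kind of set comparison (an illustration of the idea only; assertSetsEqual is a hypothetical name, not the CoreNLP helper):

import java.util.HashSet;
import java.util.Set;
import java.util.function.Supplier;

public class SetAssertSketch {
  /** Fail with a listing of the elements found on only one side. */
  static <T> void assertSetsEqual(Set<T> expected, Set<T> actual, Supplier<String> context) {
    Set<T> missing = new HashSet<>(expected);
    missing.removeAll(actual);        // expected but not produced
    Set<T> unexpected = new HashSet<>(actual);
    unexpected.removeAll(expected);   // produced but not expected
    if (!missing.isEmpty() || !unexpected.isEmpty()) {
      throw new AssertionError(context.get()
          + "\nmissing: " + missing
          + "\nunexpected: " + unexpected);
    }
  }
}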
14 changes: 13 additions & 1 deletion itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java
@@ -19,10 +19,22 @@ public class CoNLLUOutputterITest extends TestCase {
setProperty("parse.keepPunct", "true");
}});

/** Make sure that an invalid dependency type barfs. */
public void testInvalidOutputter() throws IOException {
try {
Annotation ann = new Annotation("CoNLL-U is neat. Better than XML.");
pipeline.annotate(ann);
new CoNLLUOutputter("this should fail").print(ann);  // expected to throw
throw new AssertionError("This should have failed");
} catch (IllegalArgumentException e) {
// yay
}
}

public void testSimpleSentence() throws IOException {
Annotation ann = new Annotation("CoNLL-U is neat. Better than XML.");
pipeline.annotate(ann);
String actual = new CoNLLUOutputter().print(ann);
String actual = new CoNLLUOutputter("enhanced").print(ann);
String expected = "1\tCoNLL-U\tconll-u\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t3:nsubj\t_\n" +
"2\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tcop\t3:cop\t_\n" +
"3\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_\n" +
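Each line of the expected output above is one token in CoNLL-U format: ten tab-separated fields, where the DEPS field (values like 3:nsubj) is the one the new "enhanced" mode populates. A small sketch for pulling a line apart when eyeballing test diffs (the field names are those of the published CoNLL-U specification):

public class CoNLLUFields {
  public static void main(String[] args) {
    String line = "3\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_";
    String[] names = {"ID", "FORM", "LEMMA", "UPOS", "XPOS",
                      "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
    String[] fields = line.split("\t", -1);
    for (int i = 0; i < names.length; i++) {
      System.out.println(names[i] + " = " + fields[i]);
    }
  }
}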
29 changes: 29 additions & 0 deletions itest/src/edu/stanford/nlp/pipeline/TokenizerFrenchBenchmarkITest.java
@@ -0,0 +1,29 @@
package edu.stanford.nlp.pipeline;

import java.util.Properties;


public class TokenizerFrenchBenchmarkITest extends TokenizerBenchmarkTestCase {

@Override
public void setUp() {
// set up the pipeline
Properties props = new Properties();
props.put("annotators", "tokenize,ssplit,mwt");
props.put("tokenize.language", "fr");
//props.put("tokenize.options", "splitAll=false");
props.put("mwt.mappingFile",
"/u/nlp/data/stanford-corenlp/test/data/mwt/fr-mwt.tsv");
props.put("mwt.pos.model", "/u/nlp/data/stanford-corenlp/test/models/fr-mwt.tagger");
props.put("mwt.statisticalMappingFile",
"/u/nlp/data/stanford-corenlp/test/data/fr-mwt-statistical.tsv");
props.put("ssplit.isOneSentence", "true");
pipeline = new StanfordCoreNLP(props);
}

public void testOnDev() {
goldFilePath = "/u/nlp/data/stanford-corenlp/test/data/tokenize/fr_gsd-ud-dev.conllu";
runTest("dev", "fr", 0.90);
}

}
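Outside the benchmark harness, the new mwt annotator is used like any other pipeline stage. A hedged sketch (the mapping and model paths are the cluster-local ones from the test above; the sentence and the token loop are illustrative):

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class FrenchMWTDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,mwt");
    props.setProperty("tokenize.language", "fr");
    props.setProperty("mwt.mappingFile",
        "/u/nlp/data/stanford-corenlp/test/data/mwt/fr-mwt.tsv");
    props.setProperty("mwt.pos.model",
        "/u/nlp/data/stanford-corenlp/test/models/fr-mwt.tagger");
    props.setProperty("mwt.statisticalMappingFile",
        "/u/nlp/data/stanford-corenlp/test/data/fr-mwt-statistical.tsv");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // "des" should come out split into "de" + "les".
    Annotation ann = new Annotation("Je parle des projets.");
    pipeline.annotate(ann);
    for (CoreLabel token : ann.get(CoreAnnotations.TokensAnnotation.class)) {
      System.out.println(token.word());
    }
  }
}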
@@ -200,13 +200,13 @@ public void testDependencyPathBetweenRegressions() throws IOException {
add("Geledi");
add("<-nmod:of-");
add("Sultanate");
add("<-conj-");
add("<-appos-");
add("Sultanate");
add("<-nmod:including-");
add("trade");
add("<-dobj-");
add("<-obj-");
add("dominated");
add("-nmod:in->");
add("-obl:in->");
add("Ages");
add("-compound->");
add("Middle");
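The relabeled arcs in this expected path track the Universal Dependencies v2 renames: dobj became obj, and oblique nominal dependents of predicates moved from nmod to obl (the conj → appos change looks like a difference in parser output rather than a rename). As a tiny illustrative mapping:

import java.util.HashMap;
import java.util.Map;

public class UDv2Renames {
  public static void main(String[] args) {
    // UD v1 -> v2 relation renames reflected in the updated expected path.
    Map<String, String> v1ToV2 = new HashMap<>();
    v1ToV2.put("dobj", "obj");        // direct object
    v1ToV2.put("nmod:in", "obl:in");  // oblique dependent of a verb, "in" case
    v1ToV2.forEach((v1, v2) -> System.out.println(v1 + " -> " + v2));
  }
}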
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/ie/NERFeatureFactory.java
@@ -92,7 +92,7 @@
* <tr><td> loadAuxClassifier </td><td>String</td><td>n/a</td><td>Path to auxiliary classifier to load.</td></tr>
* <tr><td> serializeTo</td><td>String</td><td>n/a</td><td>Path to serialize classifier to</td></tr>
* <tr><td> trainFile</td><td>String</td><td>n/a</td><td>Path of file to use as training data</td></tr>
* <tr><td> testFile</td><td>String</td><td>n/a</td><td>Path of file to use as training data</td></tr>
* <tr><td> testFile</td><td>String</td><td>n/a</td><td>Path of file to use as test data</td></tr>
* <tr><td> map</td><td>String</td><td>see below</td><td>This applies at training time or if testing on tab-separated column data. It says what is in each column. It doesn't apply when running on plain text data. The simplest scenario for training is having words and classes in two column. word=0,answer=1 is the default if conllNoTags is specified; otherwise word=0,tag=1,answer=2 is the default. But you can add other columns, such as for a part-of-speech tag, presences in a lexicon, etc. That would only be useful at runtime if you have part-of-speech information or whatever available and are passing it in with the tokens (that is, you can pass to classify CoreLabel tokens with additional fields stored in them).</td></tr>
* <tr><td> useWord</td><td>boolean</td><td>true</td><td>Gives you feature for w</td></tr>
* <tr><td> useBinnedLength</td><td>String</td><td>null</td><td>If non-null, treat as a sequence of comma separated integer bounds, where items above the previous bound up to the next bound are binned Len-<i>range</i></td></tr>
8 changes: 5 additions & 3 deletions src/edu/stanford/nlp/naturalli/RelationTripleSegmenter.java
@@ -39,7 +39,9 @@ public class RelationTripleSegmenter {
// { fish like to swim }
add(SemgrexPattern.compile("{$}=verb >/.subj(:pass)?/ {}=subject >xcomp ( {}=object ?>appos {}=appos )"));
// { cats have tails }
add(SemgrexPattern.compile("{$}=verb ?>/aux(:pass)?/ {}=be >/.subj(:pass)?/ {}=subject >/[di]obj|xcomp/ ( {}=object ?>appos {}=appos )"));
// Older versions of the dependencies produce dobj, while newer ones may just use obj;
// this expression accommodates both.
add(SemgrexPattern.compile("{$}=verb ?>/aux(:pass)?/ {}=be >/.subj(:pass)?/ {}=subject >/[di]?obj|xcomp/ ( {}=object ?>appos {}=appos )"));
// { Tom and Jerry were fighting }
add(SemgrexPattern.compile("{$}=verb >/nsubj(:pass)?/ ( {}=subject >/conj:and/=subjIgnored {}=object )"));
// { mass of iron is 55amu }
@@ -385,13 +387,13 @@ public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens)
/** A set of valid arcs denoting a subject entity we are interested in */
public final Set<String> VALID_SUBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod:poss"); add("nmod:tmod"); add("expl");
add("nsubj"); add("case");
add("nsubj"); add("case"); add("mark");
}});

/** A set of valid arcs denoting an object entity we are interested in */
public final Set<String> VALID_OBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod"); add("nsubj"); add("nmod:*"); add("nmod:poss");
add("nmod:tmod"); add("conj:and"); add("advmod"); add("acl"); add("case");
add("nmod:tmod"); add("conj:and"); add("advmod"); add("acl"); add("case"); add("mark");
// add("advcl"); // Born in Hawaii, Obama is a US citizen; citizen -advcl-> Born.
}});

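The [di]?obj edit in the first hunk is the key change here: making the d/i prefix optional lets the pattern match the bare obj relation that newer dependency versions emit. A quick check with plain java.util.regex (standing in for Semgrex's regex-style relation matching) shows what the new alternation accepts:

import java.util.regex.Pattern;

public class ObjRelationRegexDemo {
  public static void main(String[] args) {
    Pattern rel = Pattern.compile("[di]?obj|xcomp");
    for (String name : new String[] {"dobj", "iobj", "obj", "xcomp", "nobj"}) {
      // Prints true for dobj, iobj, obj, and xcomp; false for nobj.
      System.out.println(name + " -> " + rel.matcher(name).matches());
    }
  }
}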
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/pipeline/Annotator.java
@@ -86,6 +86,7 @@ default void unmount() { }
String STANFORD_TOKENIZE = "tokenize";
String STANFORD_CLEAN_XML = "cleanxml";
String STANFORD_SSPLIT = "ssplit";
String STANFORD_MWT = "mwt";
String STANFORD_DOCDATE = "docdate";
String STANFORD_POS = "pos";
String STANFORD_LEMMA = "lemma";
@@ -123,6 +124,7 @@ default void unmount() { }
put(STANFORD_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
8 changes: 8 additions & 0 deletions src/edu/stanford/nlp/pipeline/AnnotatorImplementations.java
@@ -45,6 +45,14 @@ public Annotator wordToSentences(Properties properties) {
return new WordsToSentencesAnnotator(properties);
}

/**
* Multi-word tokens: split a token into its component words (e.g. French "des" into "de" and "les")
*/
public Annotator multiWordToken(Properties props) {
// MWTAnnotator defaults to using "mwt." as prefix
return new MWTAnnotator("", props);
}

/**
* Set document date
*/
10 changes: 9 additions & 1 deletion src/edu/stanford/nlp/pipeline/CoNLLUOutputter.java
@@ -98,6 +98,12 @@ public CoNLLUOutputter() {
this(new Properties());
}

public CoNLLUOutputter(String type) {
this(new Properties() {{
setProperty("output.dependenciesType", type);
}});
}

public CoNLLUOutputter(Properties props) {
dependenciesType = props.getProperty("output.dependenciesType", "basic");
}
@@ -116,8 +122,10 @@ public void print(Annotation doc, OutputStream target, Options options) throws IOException {
} else if (dependenciesType.equals("enhancedPlusPlus")) {
SemanticGraph enhancedSg = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
writer.print(conllUWriter.printSemanticGraph(sg, enhancedSg));
} else {
} else if (dependenciesType.equals("basic")) {
writer.print(conllUWriter.printSemanticGraph(sg));
} else {
throw new IllegalArgumentException("CoNLLUOutputter: unknown dependencies type " + dependenciesType);
}
} else {
writer.print(conllUWriter.printPOSAnnotations(sentence));
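With the new constructor and the stricter dispatch, usage looks like this (a sketch assuming an already-annotated Annotation, as in the updated ITest above):

import java.io.IOException;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.CoNLLUOutputter;

public class CoNLLUOutputterUsage {
  static void printAll(Annotation ann) throws IOException {
    String basic    = new CoNLLUOutputter().print(ann);                    // default: basic dependencies
    String enhanced = new CoNLLUOutputter("enhanced").print(ann);          // fills the DEPS column
    String epp      = new CoNLLUOutputter("enhancedPlusPlus").print(ann);  // enhanced++ graph
    // Anything else now fails fast instead of silently falling back to basic:
    // new CoNLLUOutputter("typo").print(ann);  // would throw IllegalArgumentException
    System.out.println(basic + "\n" + enhanced + "\n" + epp);
  }
}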
16 changes: 16 additions & 0 deletions src/edu/stanford/nlp/pipeline/LanguageInfo.java
@@ -1,5 +1,8 @@
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;

import java.io.*;
import java.lang.reflect.Field;
import java.util.*;

@@ -54,6 +57,14 @@ public static String getLanguagePropertiesFile(String inputString) {
return languageToPropertiesFile.get(getLanguageFromString(inputString));
}

/** Return a Properties object loaded from the properties file for the given language. **/
public static Properties getLanguageProperties(String inputString) throws IOException {
Properties props = new Properties();
try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(getLanguagePropertiesFile(inputString))) {
props.load(is);
}
return props;
}

/** convert various input strings to language enum **/
public static HumanLanguage getLanguageFromString(String inputString) {
if (inputString.toLowerCase().equals("arabic") || inputString.toLowerCase().equals("ar"))
@@ -72,6 +83,11 @@ public static HumanLanguage getLanguageFromString(String inputString) {
return null;
}

/** Return whether the String names a language supported by Stanford CoreNLP. **/
public static boolean isStanfordCoreNLPSupportedLang(String lang) {
return (getLanguageFromString(lang) != null);
}

/** Check if language is a segmenter language, return boolean. **/
public static boolean isSegmenterLanguage(HumanLanguage language) {
return language == HumanLanguage.ARABIC || language == HumanLanguage.CHINESE;
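A hedged sketch of how the two new helpers compose when bootstrapping a pipeline from a language code (the guard-then-load pattern is illustrative, and "fr" assumes the French properties file is available on the classpath):

import java.io.IOException;
import java.util.Properties;
import edu.stanford.nlp.pipeline.LanguageInfo;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class LanguagePipelineDemo {
  public static void main(String[] args) throws IOException {
    String lang = "fr";
    if (!LanguageInfo.isStanfordCoreNLPSupportedLang(lang)) {
      throw new IllegalArgumentException("Unsupported language: " + lang);
    }
    Properties props = LanguageInfo.getLanguageProperties(lang);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    System.out.println("Loaded annotators: " + props.getProperty("annotators"));
  }
}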
