Merge branch 'master' of ssh://origin
AngledLuffa authored and Stanford NLP committed Jun 24, 2019
1 parent 737422c commit 8eaefe6
Showing 15 changed files with 313 additions and 82 deletions.
4 changes: 3 additions & 1 deletion build.xml
@@ -160,7 +160,8 @@
<target name="itest" depends="classpath,compile"
description="Run core integration tests">
<echo message="${ant.project.name}" />
<junit fork="yes" maxmemory="12g" printsummary="off" outputtoformatters="false" forkmode="perTest" haltonfailure="true">
<junit fork="yes" maxmemory="12g" printsummary="off" outputtoformatters="false" forkmode="perTest" haltonfailure="no"
haltonerror="no" failureproperty="test.failed" errorproperty="test.failed">
<classpath refid="classpath"/>
<classpath path="${build.path}"/>
<classpath path="${data.path}"/>
@@ -173,6 +174,7 @@
</fileset>
</batchtest>
</junit>
<fail message="Test failure detected, check test results." if="test.failed" />
</target>

<target name="itest-many-docs" depends="classpath"
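The change above stops the integration-test target from halting at the first failing test: with haltonfailure and haltonerror set to "no", every forked test batch runs, any failure or error merely sets the test.failed property, and the new <fail> task reports once at the end. In plain Java, the same collect-then-fail pattern looks roughly like this (the Runnables are illustrative stand-ins for the forked JUnit batches):

import java.util.Arrays;
import java.util.List;

public class CollectThenFail {
  public static void main(String[] args) {
    List<Runnable> tests = Arrays.asList(
        () -> { /* passes */ },
        () -> { throw new AssertionError("boom"); },
        () -> { /* still runs, despite the failure above */ });
    boolean testFailed = false;  // plays the role of failureproperty="test.failed"
    for (Runnable test : tests) {
      try {
        test.run();
      } catch (AssertionError e) {
        testFailed = true;       // record the failure, keep running the rest
      }
    }
    if (testFailed) {            // plays the role of <fail ... if="test.failed"/>
      throw new AssertionError("Test failure detected, check test results.");
    }
  }
}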
20 changes: 14 additions & 6 deletions itest/src/edu/stanford/nlp/naturalli/OpenIEITest.java
@@ -5,12 +5,14 @@
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Sets;
import org.junit.Ignore;
import org.junit.Test;

import java.util.*;

import java.util.stream.Collectors;

import static org.junit.Assert.*;

/**
@@ -53,11 +55,12 @@ public void assertExtracted(String expected, String text) {

public void assertExtracted(Set<String> expectedSet, String text) {
Collection<RelationTriple> extractions = annotate(text).get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
String actual = StringUtils.join(
extractions.stream().map(x -> x.toString().substring(x.toString().indexOf("\t") + 1).toLowerCase()).sorted(),
"\n");
String expected = StringUtils.join(expectedSet.stream().map(String::toLowerCase).sorted(), "\n");
assertEquals(expected, actual);
Set<String> actual =
extractions.stream().map(x -> x.toString().substring(x.toString().indexOf("\t") + 1).toLowerCase())
.collect(Collectors.toSet());
Set<String> expected = expectedSet.stream().map(String::toLowerCase).collect(Collectors.toSet());
Sets.assertEquals(expected, actual, "expected", "actual", true,
() -> "Unexpected results processing " + text);
}

public void assertEntailed(String expected, String text) {
@@ -234,6 +237,11 @@ public void testChessIsNotAPhysicalSport() {
add("Chess\tis\tphysical sport");
add("Chess\tis\tsport");
}}, "Chess is a physical sport");
// TODO: this is failing either because a potential triple is
// being extracted in RelationTripleSegmenter when it shouldn't be,
// or because the polarity check that runs after that extraction
// registers the polarity as "up" and so fails to encode the
// negation
assertExtracted(new HashSet<String>() {{
}}, "Chess is not a physical sport");
}
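The rewritten assertExtracted above compares extractions as sets via Sets.assertEquals, so a mismatch reports which triples are missing or unexpected instead of producing one big sorted-string diff. A minimal sketch of that kind of set comparison (an illustration of the idea only; assertSetsEqual is a hypothetical name, not the CoreNLP helper):

import java.util.HashSet;
import java.util.Set;
import java.util.function.Supplier;

public class SetAssertSketch {
  /** Fail with a listing of the elements found on only one side. */
  static <T> void assertSetsEqual(Set<T> expected, Set<T> actual, Supplier<String> context) {
    Set<T> missing = new HashSet<>(expected);
    missing.removeAll(actual);        // expected but not produced
    Set<T> unexpected = new HashSet<>(actual);
    unexpected.removeAll(expected);   // produced but not expected
    if (!missing.isEmpty() || !unexpected.isEmpty()) {
      throw new AssertionError(context.get()
          + "\nmissing: " + missing
          + "\nunexpected: " + unexpected);
    }
  }
}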
14 changes: 13 additions & 1 deletion itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java
@@ -19,10 +19,22 @@ public class CoNLLUOutputterITest extends TestCase {
setProperty("parse.keepPunct", "true");
}});

/** Make sure that an invalid dependency type barfs. */
public void testInvalidOutputter() throws IOException {
try {
Annotation ann = new Annotation("CoNLL-U is neat. Better than XML.");
pipeline.annotate(ann);
new CoNLLUOutputter("this should fail").print(ann);  // expected to throw
throw new AssertionError("This should have failed");
} catch (IllegalArgumentException e) {
// yay
}
}

public void testSimpleSentence() throws IOException {
Annotation ann = new Annotation("CoNLL-U is neat. Better than XML.");
pipeline.annotate(ann);
String actual = new CoNLLUOutputter().print(ann);
String actual = new CoNLLUOutputter("enhanced").print(ann);
String expected = "1\tCoNLL-U\tconll-u\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t3:nsubj\t_\n" +
"2\tis\tbe\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tcop\t3:cop\t_\n" +
"3\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_\n" +
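Each line of the expected output above is one token in CoNLL-U format: ten tab-separated fields, where the DEPS field (values like 3:nsubj) is the one the new "enhanced" mode populates. A small sketch for pulling a line apart when eyeballing test diffs (the field names are those of the published CoNLL-U specification):

public class CoNLLUFields {
  public static void main(String[] args) {
    String line = "3\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_";
    String[] names = {"ID", "FORM", "LEMMA", "UPOS", "XPOS",
                      "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
    String[] fields = line.split("\t", -1);
    for (int i = 0; i < names.length; i++) {
      System.out.println(names[i] + " = " + fields[i]);
    }
  }
}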
29 changes: 29 additions & 0 deletions itest/src/edu/stanford/nlp/pipeline/TokenizerFrenchBenchmarkITest.java
@@ -0,0 +1,29 @@
package edu.stanford.nlp.pipeline;

import java.util.Properties;


public class TokenizerFrenchBenchmarkITest extends TokenizerBenchmarkTestCase {

@Override
public void setUp() {
// set up the pipeline
Properties props = new Properties();
props.put("annotators", "tokenize,ssplit,mwt");
props.put("tokenize.language", "fr");
//props.put("tokenize.options", "splitAll=false");
props.put("mwt.mappingFile",
"/u/nlp/data/stanford-corenlp/test/data/mwt/fr-mwt.tsv");
props.put("mwt.pos.model", "/u/nlp/data/stanford-corenlp/test/models/fr-mwt.tagger");
props.put("mwt.statisticalMappingFile",
"/u/nlp/data/stanford-corenlp/test/data/fr-mwt-statistical.tsv");
props.put("ssplit.isOneSentence", "true");
pipeline = new StanfordCoreNLP(props);
}

public void testOnDev() {
goldFilePath = "/u/nlp/data/stanford-corenlp/test/data/tokenize/fr_gsd-ud-dev.conllu";
runTest("dev", "fr", 0.90);
}

}
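Outside the benchmark harness, the new mwt annotator is used like any other pipeline stage. A hedged sketch (the mapping and model paths are the cluster-local ones from the test above; the sentence and the token loop are illustrative):

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class FrenchMWTDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,mwt");
    props.setProperty("tokenize.language", "fr");
    props.setProperty("mwt.mappingFile",
        "/u/nlp/data/stanford-corenlp/test/data/mwt/fr-mwt.tsv");
    props.setProperty("mwt.pos.model",
        "/u/nlp/data/stanford-corenlp/test/models/fr-mwt.tagger");
    props.setProperty("mwt.statisticalMappingFile",
        "/u/nlp/data/stanford-corenlp/test/data/fr-mwt-statistical.tsv");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // "des" should come out split into "de" + "les".
    Annotation ann = new Annotation("Je parle des projets.");
    pipeline.annotate(ann);
    for (CoreLabel token : ann.get(CoreAnnotations.TokensAnnotation.class)) {
      System.out.println(token.word());
    }
  }
}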
@@ -200,13 +200,13 @@ public void testDependencyPathBetweenRegressions() throws IOException {
add("Geledi");
add("<-nmod:of-");
add("Sultanate");
add("<-conj-");
add("<-appos-");
add("Sultanate");
add("<-nmod:including-");
add("trade");
add("<-dobj-");
add("<-obj-");
add("dominated");
add("-nmod:in->");
add("-obl:in->");
add("Ages");
add("-compound->");
add("Middle");
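The relabeled arcs in this expected path track the Universal Dependencies v2 renames: dobj became obj, and oblique nominal dependents of predicates moved from nmod to obl (the conj → appos change looks like a difference in parser output rather than a rename). As a tiny illustrative mapping:

import java.util.HashMap;
import java.util.Map;

public class UDv2Renames {
  public static void main(String[] args) {
    // UD v1 -> v2 relation renames reflected in the updated expected path.
    Map<String, String> v1ToV2 = new HashMap<>();
    v1ToV2.put("dobj", "obj");        // direct object
    v1ToV2.put("nmod:in", "obl:in");  // oblique dependent of a verb, "in" case
    v1ToV2.forEach((v1, v2) -> System.out.println(v1 + " -> " + v2));
  }
}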
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/ie/NERFeatureFactory.java
@@ -92,7 +92,7 @@
* <tr><td> loadAuxClassifier </td><td>String</td><td>n/a</td><td>Path to auxiliary classifier to load.</td></tr>
* <tr><td> serializeTo</td><td>String</td><td>n/a</td><td>Path to serialize classifier to</td></tr>
* <tr><td> trainFile</td><td>String</td><td>n/a</td><td>Path of file to use as training data</td></tr>
* <tr><td> testFile</td><td>String</td><td>n/a</td><td>Path of file to use as training data</td></tr>
* <tr><td> testFile</td><td>String</td><td>n/a</td><td>Path of file to use as test data</td></tr>
* <tr><td> map</td><td>String</td><td>see below</td><td>This applies at training time or if testing on tab-separated column data. It says what is in each column. It doesn't apply when running on plain text data. The simplest scenario for training is having words and classes in two column. word=0,answer=1 is the default if conllNoTags is specified; otherwise word=0,tag=1,answer=2 is the default. But you can add other columns, such as for a part-of-speech tag, presences in a lexicon, etc. That would only be useful at runtime if you have part-of-speech information or whatever available and are passing it in with the tokens (that is, you can pass to classify CoreLabel tokens with additional fields stored in them).</td></tr>
* <tr><td> useWord</td><td>boolean</td><td>true</td><td>Gives you feature for w</td></tr>
* <tr><td> useBinnedLength</td><td>String</td><td>null</td><td>If non-null, treat as a sequence of comma separated integer bounds, where items above the previous bound up to the next bound are binned Len-<i>range</i></td></tr>
8 changes: 5 additions & 3 deletions src/edu/stanford/nlp/naturalli/RelationTripleSegmenter.java
@@ -39,7 +39,9 @@ public class RelationTripleSegmenter {
// { fish like to swim }
add(SemgrexPattern.compile("{$}=verb >/.subj(:pass)?/ {}=subject >xcomp ( {}=object ?>appos {}=appos )"));
// { cats have tails }
add(SemgrexPattern.compile("{$}=verb ?>/aux(:pass)?/ {}=be >/.subj(:pass)?/ {}=subject >/[di]obj|xcomp/ ( {}=object ?>appos {}=appos )"));
// Older versions of the dependencies produce dobj, while newer ones may just use obj;
// this expression accommodates both.
add(SemgrexPattern.compile("{$}=verb ?>/aux(:pass)?/ {}=be >/.subj(:pass)?/ {}=subject >/[di]?obj|xcomp/ ( {}=object ?>appos {}=appos )"));
// { Tom and Jerry were fighting }
add(SemgrexPattern.compile("{$}=verb >/nsubj(:pass)?/ ( {}=subject >/conj:and/=subjIgnored {}=object )"));
// { mass of iron is 55amu }
@@ -385,13 +387,13 @@ public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens)
/** A set of valid arcs denoting a subject entity we are interested in */
public final Set<String> VALID_SUBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod:poss"); add("nmod:tmod"); add("expl");
add("nsubj"); add("case");
add("nsubj"); add("case"); add("mark");
}});

/** A set of valid arcs denoting an object entity we are interested in */
public final Set<String> VALID_OBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod"); add("nsubj"); add("nmod:*"); add("nmod:poss");
add("nmod:tmod"); add("conj:and"); add("advmod"); add("acl"); add("case");
add("nmod:tmod"); add("conj:and"); add("advmod"); add("acl"); add("case"); add("mark");
// add("advcl"); // Born in Hawaii, Obama is a US citizen; citizen -advcl-> Born.
}});

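The [di]?obj edit in the first hunk is the key change here: making the d/i prefix optional lets the pattern match the bare obj relation that newer dependency versions emit. A quick check with plain java.util.regex (standing in for Semgrex's regex-style relation matching) shows what the new alternation accepts:

import java.util.regex.Pattern;

public class ObjRelationRegexDemo {
  public static void main(String[] args) {
    Pattern rel = Pattern.compile("[di]?obj|xcomp");
    for (String name : new String[] {"dobj", "iobj", "obj", "xcomp", "nobj"}) {
      // Prints true for dobj, iobj, obj, and xcomp; false for nobj.
      System.out.println(name + " -> " + rel.matcher(name).matches());
    }
  }
}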
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/pipeline/Annotator.java
@@ -86,6 +86,7 @@ default void unmount() { }
String STANFORD_TOKENIZE = "tokenize";
String STANFORD_CLEAN_XML = "cleanxml";
String STANFORD_SSPLIT = "ssplit";
String STANFORD_MWT = "mwt";
String STANFORD_DOCDATE = "docdate";
String STANFORD_POS = "pos";
String STANFORD_LEMMA = "lemma";
@@ -123,6 +124,7 @@ default void unmount() { }
put(STANFORD_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
8 changes: 8 additions & 0 deletions src/edu/stanford/nlp/pipeline/AnnotatorImplementations.java
@@ -45,6 +45,14 @@ public Annotator wordToSentences(Properties properties) {
return new WordsToSentencesAnnotator(properties);
}

/**
* Multi-word tokens: split a token into its component words (e.g. French "des" into "de" and "les")
*/
public Annotator multiWordToken(Properties props) {
// MWTAnnotator defaults to using "mwt." as prefix
return new MWTAnnotator("", props);
}

/**
* Set document date
*/
10 changes: 9 additions & 1 deletion src/edu/stanford/nlp/pipeline/CoNLLUOutputter.java
@@ -98,6 +98,12 @@ public CoNLLUOutputter() {
this(new Properties());
}

public CoNLLUOutputter(String type) {
this(new Properties() {{
setProperty("output.dependenciesType", type);
}});
}

public CoNLLUOutputter(Properties props) {
dependenciesType = props.getProperty("output.dependenciesType", "basic");
}
@@ -116,8 +122,10 @@ public void print(Annotation doc, OutputStream target, Options options) throws IOException {
} else if (dependenciesType.equals("enhancedPlusPlus")) {
SemanticGraph enhancedSg = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
writer.print(conllUWriter.printSemanticGraph(sg, enhancedSg));
} else {
} else if (dependenciesType.equals("basic")) {
writer.print(conllUWriter.printSemanticGraph(sg));
} else {
throw new IllegalArgumentException("CoNLLUOutputter: unknown dependencies type " + dependenciesType);
}
} else {
writer.print(conllUWriter.printPOSAnnotations(sentence));
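With the new constructor and the stricter dispatch, usage looks like this (a sketch assuming an already-annotated Annotation, as in the updated ITest above):

import java.io.IOException;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.CoNLLUOutputter;

public class CoNLLUOutputterUsage {
  static void printAll(Annotation ann) throws IOException {
    String basic    = new CoNLLUOutputter().print(ann);                    // default: basic dependencies
    String enhanced = new CoNLLUOutputter("enhanced").print(ann);          // fills the DEPS column
    String epp      = new CoNLLUOutputter("enhancedPlusPlus").print(ann);  // enhanced++ graph
    // Anything else now fails fast instead of silently falling back to basic:
    // new CoNLLUOutputter("typo").print(ann);  // would throw IllegalArgumentException
    System.out.println(basic + "\n" + enhanced + "\n" + epp);
  }
}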
16 changes: 16 additions & 0 deletions src/edu/stanford/nlp/pipeline/LanguageInfo.java
@@ -1,5 +1,8 @@
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;

import java.io.*;
import java.lang.reflect.Field;
import java.util.*;

@@ -54,6 +57,14 @@ public static String getLanguagePropertiesFile(String inputString) {
return languageToPropertiesFile.get(getLanguageFromString(inputString));
}

/** Return a Properties object loaded from the properties file for the given language. **/
public static Properties getLanguageProperties(String inputString) throws IOException {
Properties props = new Properties();
try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(getLanguagePropertiesFile(inputString))) {
props.load(is);
}
return props;
}

/** convert various input strings to language enum **/
public static HumanLanguage getLanguageFromString(String inputString) {
if (inputString.toLowerCase().equals("arabic") || inputString.toLowerCase().equals("ar"))
@@ -72,6 +83,11 @@ public static HumanLanguage getLanguageFromString(String inputString) {
return null;
}

/** Return whether the String names a language supported by Stanford CoreNLP. **/
public static boolean isStanfordCoreNLPSupportedLang(String lang) {
return (getLanguageFromString(lang) != null);
}

/** Check if language is a segmenter language, return boolean. **/
public static boolean isSegmenterLanguage(HumanLanguage language) {
return language == HumanLanguage.ARABIC || language == HumanLanguage.CHINESE;
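A hedged sketch of how the two new helpers compose when bootstrapping a pipeline from a language code (the guard-then-load pattern is illustrative, and "fr" assumes the French properties file is available on the classpath):

import java.io.IOException;
import java.util.Properties;
import edu.stanford.nlp.pipeline.LanguageInfo;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class LanguagePipelineDemo {
  public static void main(String[] args) throws IOException {
    String lang = "fr";
    if (!LanguageInfo.isStanfordCoreNLPSupportedLang(lang)) {
      throw new IllegalArgumentException("Unsupported language: " + lang);
    }
    Properties props = LanguageInfo.getLanguageProperties(lang);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    System.out.println("Loaded annotators: " + props.getProperty("annotators"));
  }
}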
