
Tests still failing, but OpenIE is starting to come together
Gabor Angeli authored and Stanford NLP committed Mar 12, 2015
1 parent 30210eb commit 4246a16
Showing 129 changed files with 9,063 additions and 2,082 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -1,14 +1,14 @@
Stanford CoreNLP
================


Stanford CoreNLP provides a set of natural language analysis tools written in Java. It can take raw human language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, and mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It was originally developed for English, but now also provides varying levels of support for Arabic, (mainland) Chinese, French, German, and Spanish. Stanford CoreNLP is an integrated framework, which makes it very easy to apply a number of language analysis tools to a piece of text. Starting from plain text, you can run all the tools on it with just two lines of code. Its analyses provide the foundational building blocks for higher-level and domain-specific text understanding applications.
Stanford CoreNLP is a set of stable and well-tested natural language processing tools, widely used by various groups in academia, government, and industry.
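
The "two lines of code" claim can be sketched as follows. This is a minimal sketch, not part of the commit: the class name `PipelineConfigSketch` is hypothetical, and the annotator list is copied from the OpenIE integration test added in this commit. Actually constructing the pipeline requires the CoreNLP jar and models on the classpath, so that part is left in comments.

```java
import java.util.Properties;

public class PipelineConfigSketch {
    // Annotator list mirrors the OpenIE integration test added in this commit.
    static Properties openIEProperties() {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
        props.setProperty("ssplit.isOneSentence", "true");
        return props;
    }

    public static void main(String[] args) {
        // With the CoreNLP jar and models on the classpath, the two lines would be:
        //   StanfordCoreNLP pipeline = new StanfordCoreNLP(openIEProperties());
        //   pipeline.annotate(new Annotation("Some cats have tails."));
        System.out.println(openIEProperties().getProperty("annotators"));
    }
}
```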


The Stanford CoreNLP code is written in Java and licensed under the GNU General Public License (v3 or later). Note that this is the full GPL, which allows many free uses, but not its use in proprietary software that you distribute.


You can find releases of Stanford CoreNLP on [Maven Central](http://search.maven.org/#browse%7C11864822).


You can find more explanation and documentation on [the Stanford CoreNLP homepage](http://nlp.stanford.edu/software/corenlp.shtml#Demo).


The most recent models associated with the code in the HEAD of this repository can be found [here](http://nlp.stanford.edu/software/stanford-corenlp-models-current.jar).


For information about making contributions to Stanford CoreNLP, see the file `CONTRIBUTING.md`.
2 changes: 2 additions & 0 deletions build.xml
@@ -107,6 +107,8 @@
<compilerarg value="-Xmaxwarns"/>
<compilerarg value="10000"/> -->
</javac>
<copy file="${source.path}/edu/stanford/nlp/trees/ENUniversalPOS.tsurgeon"
todir="${build.path}/edu/stanford/nlp/trees/" />
</target>


<target name="test" depends="classpath,compile"
@@ -65,7 +65,7 @@ public class SpanishTokenizerITest extends TestCase {


public void testSpanishTokenizerWord() {
assert (ptbInputs.length == ptbGold.length);
final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.ancoraFactory();
tf.setOptions("");
tf.setOptions("tokenizeNLs");


@@ -1,6 +1,5 @@
package edu.stanford.nlp.ling.tokensregex;


import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.util.CoreMap;
@@ -9,10 +8,8 @@
import edu.stanford.nlp.util.Timing;
import junit.framework.TestCase;


import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -1526,4 +1523,5 @@ public void testCaseInsensitive2(){
boolean match = m.find();
assertTrue(match);
}

}
@@ -0,0 +1,44 @@
package edu.stanford.nlp.naturalli;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import org.junit.Test;

import java.util.List;
import java.util.Properties;

import static org.junit.Assert.*;

/**
* A lightweight test to make sure the annotator runs in the pipeline.
* For more in-depth tests, see {@link edu.stanford.nlp.naturalli.OperatorScopeITest} and
* {@link edu.stanford.nlp.naturalli.PolarityITest}.
*
* @author Gabor Angeli
*/
public class NaturalLogicAnnotatorITest {

@Test
public void testAnnotatorRuns() {
// Run pipeline
StanfordCoreNLP pipeline = new StanfordCoreNLP(new Properties(){{
setProperty("annotators", "tokenize,ssplit,pos,lemma,parse,natlog");
setProperty("ssplit.isOneSentence", "true");
setProperty("tokenize.class", "PTBTokenizer");
setProperty("tokenize.language", "en");
setProperty("enforceRequirements", "true");
}});
Annotation ann = new Annotation("All cats have tails");
pipeline.annotate(ann);

// Check output
List<CoreLabel> tokens = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0).get(CoreAnnotations.TokensAnnotation.class);
assertTrue(tokens.get(0).containsKey(NaturalLogicAnnotations.OperatorAnnotation.class));
assertTrue(tokens.get(0).get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards());
assertTrue(tokens.get(1).get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards());
assertTrue(tokens.get(2).get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards());
assertTrue(tokens.get(3).get(NaturalLogicAnnotations.PolarityAnnotation.class).isUpwards());
}
}
156 changes: 156 additions & 0 deletions itest/src/edu/stanford/nlp/naturalli/OpenIEITest.java
@@ -0,0 +1,156 @@
package edu.stanford.nlp.naturalli;

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import org.junit.Test;

import java.util.*;
import java.util.stream.Collectors;

import static org.junit.Assert.*;

/**
* Test the natural logic OpenIE extractor at {@link edu.stanford.nlp.naturalli.OpenIE}.
*
* @author Gabor Angeli
*/
public class OpenIEITest {
protected static StanfordCoreNLP pipeline = new StanfordCoreNLP(new Properties(){{
setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
setProperty("ssplit.isOneSentence", "true");
setProperty("tokenize.class", "PTBTokenizer");
setProperty("tokenize.language", "en");
setProperty("enforceRequirements", "true");
}});

public CoreMap annotate(String text) {
Annotation ann = new Annotation(text);
pipeline.annotate(ann);
return ann.get(CoreAnnotations.SentencesAnnotation.class).get(0);
}

public void assertExtracted(String expected, String text) {
boolean found = false;
Collection<RelationTriple> extractions = annotate(text).get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
for (RelationTriple extraction : extractions) {
if (extraction.toString().equals("1.0\t" + expected)) {
found = true;
}
}
assertTrue("The extraction '" + expected + "' was not found in '" + text + "'", found);
}

public void assertExtracted(Set<String> expected, String text) {
Collection<RelationTriple> extractions = annotate(text).get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
Set<String> guess = extractions.stream().filter(x -> x.confidence > 0.1).map(RelationTriple::toString).collect(Collectors.toSet());
assertEquals(StringUtils.join(expected.stream().sorted(), "\n").toLowerCase(), StringUtils.join(guess.stream().map( x -> x.substring(x.indexOf("\t") + 1) ).sorted(), "\n").toLowerCase());
}

public void assertEntailed(String expected, String text) {
boolean found = false;
Collection<SentenceFragment> extractions = annotate(text).get(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);
for (SentenceFragment extraction : extractions) {
if (extraction.toString().equals(expected)) {
found = true;
}
}
assertTrue("The sentence '" + expected + "' was not entailed from '" + text + "'", found);
}


@Test
public void testAnnotatorRuns() {
annotate("all cats have tails");
}

@Test
public void testBasicEntailments() {
assertEntailed("some cats have tails", "some blue cats have tails");
assertEntailed("blue cats have tails", "some blue cats have tails");
assertEntailed("cats have tails", "some blue cats have tails");
}

@Test
public void testBasicExtractions() {
assertExtracted("cats\thave\ttails", "some cats have tails");
}

@Test
public void testExtractionsObamaWikiOne() {
assertExtracted(new HashSet<String>() {{
add("Barack Hussein Obama II\tis 44th and current President of\tUnited States");
add("Barack Hussein Obama II\tis 44th President of\tUnited States");
add("Barack Hussein Obama II\tis current President of\tUnited States");
add("Barack Hussein Obama II\tis President of\tUnited States");
add("Barack Hussein Obama II\tis\tPresident");
add("Barack Hussein Obama II\tis\tcurrent President");
add("Barack Hussein Obama II\tis\t44th President");
}}, "Barack Hussein Obama II is the 44th and current President of the United States, and the first African American to hold the office.");
}

@Test
public void testExtractionsObamaWikiTwo() {
assertExtracted(new HashSet<String>() {{
add("Obama\tis graduate of\tColumbia University");
add("Obama\tis graduate of\tHarvard Law School");
add("Obama\tborn in\tHonolulu Hawaii");
add("he\tserved as\tpresident of Harvard Law Review");
add("he\tserved as\tpresident");
add("Obama\tis\tgraduate");
}}, "Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he served as president of the Harvard Law Review.");
}

@Test
public void testExtractionsObamaWikiThree() {
assertExtracted(new HashSet<String>() {{
add("He\twas\tcommunity organizer in Chicago");
add("He\twas\tcommunity organizer");
add("He\tearning\tlaw degree");
}}, "He was a community organizer in Chicago before earning his law degree.");
}

@Test
public void testExtractionsObamaWikiFour() {
assertExtracted(new HashSet<String>() {{
add("He\tworked as\tcivil rights attorney");
add("He\tworked as\trights attorney");
add("He\ttaught\tconstitutional law");
add("He\ttaught\tlaw");
add("He\ttaught at\tUniversity of Chicago Law School");
add("He\ttaught at\tUniversity of Chicago Law School from 1992");
add("He\ttaught at\tUniversity");
add("He\ttaught to\t2004"); // shouldn't be here, but sometimes appears?
}}, "He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School from 1992 to 2004.");
}

@Test
public void testExtractionsObamaWikiFive() {
assertExtracted(new HashSet<String>() {{
add("He\tserved\tthree terms");
add("He\trepresenting\t13th District in Illinois Senate");
add("He\trepresenting\t13th District");
add("He\trepresenting\tDistrict in Illinois Senate");
add("He\trepresenting\tDistrict");
add("He\trunning unsuccessfully for\tUnited States House of Representatives in 2000");
add("He\trunning unsuccessfully for\tUnited States House of Representatives");
add("He\trunning unsuccessfully for\tUnited States House");
add("He\trunning for\tUnited States House of Representatives in 2000");
add("He\trunning for\tUnited States House of Representatives");
add("He\trunning for\tUnited States House");
}}, "He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000.");
}

@Test
public void testExtractionsObamaWikiSix() {
assertExtracted(new HashSet<String>() {{
add("He\tdefeated\tRepublican nominee John McCain");
add("He\tdefeated\tnominee John McCain");
add("He\twas inaugurated as\tpresident on January 20 2009");
add("He\twas inaugurated as\tpresident");
}}, "He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009.");
}
}
