Fix the OpenIE ITest

stanfordnlp · Sep 29, 2015 · 6207170 · 6207170
1 parent 5d538e8
commit 6207170
Show file tree

Hide file tree

Showing 37 changed files with 303,835 additions and 1,988 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 Stanford CoreNLP
 ================
 
-Stanford CoreNLP provides a set of natural language analysis tools written in Java. It can take raw human language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize and interpret dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases or word dependencies, and indicate which noun phrases refer to the same entities. It was originally developed for English, but now also provides varying levels of support for (Modern Standard) Arabic, (mainland) Chinese, French, German, and Spanish. Stanford CoreNLP is an integrated framework, which make it very easy to apply a bunch of language analysis tools to a piece of text. Starting from plain text, you can run all the tools with just two lines of code. Its analyses provide the foundational building blocks for higher-level and domain-specific text understanding applications. Stanford CoreNLP is a set of stable and well-tested natural language processing tools, widely used by various groups in academia, industry, and government.
+Stanford CoreNLP provides a set of natural language analysis tools written in Java. It can take raw human language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It was originally developed for English, but now also provides varying levels of support for Arabic, (mainland) Chinese, French, German, and Spanish. Stanford CoreNLP is an integrated framework, which makes it very easy to apply a bunch of language analysis tools to a piece of text. Starting from plain text, you can run all the tools on it with just two lines of code. Its analyses provide the foundational building blocks for higher-level and domain-specific text understanding applications. Stanford CoreNLP is a set of stable and well-tested natural language processing tools, widely used by various groups in academia, government, and industry.
 
 The Stanford CoreNLP code is written in Java and licensed under the GNU General Public License (v3 or later). Note that this is the full GPL, which allows many free uses, but not its use in proprietary software that you distribute.
 

diff --git a/build.gradle b/build.gradle
@@ -47,7 +47,6 @@ task listDeps << {
 
 dependencies {
   compile fileTree(dir: 'lib', include: '*.jar')
-  testCompile fileTree(dir: 'liblocal', include: '*.jar')
 }
 
 // Eclipse plugin setup

diff --git a/build.xml b/build.xml
@@ -26,10 +26,6 @@
         <include name="*.jar"/>
         <exclude name="javanlp*"/>
       </fileset>
-      <fileset dir="${basedir}/liblocal">
-        <include name="*.jar"/>
-        <exclude name="javanlp*"/>
-      </fileset>
     </path>
   </target>
 

diff --git a/itest/src/edu/stanford/nlp/naturalli/OpenIEITest.java b/itest/src/edu/stanford/nlp/naturalli/OpenIEITest.java
@@ -10,7 +10,6 @@
 import org.junit.Test;
 
 import java.util.*;
-import java.util.stream.Collectors;
 
 import static org.junit.Assert.*;
 
@@ -52,10 +51,13 @@ public void assertExtracted(String expected, String text) {
     assertTrue("The extraction '" + expected + "' was not found in '" + text + "'", found);
   }
 
-  public void assertExtracted(Set<String> expected, String text) {
+  public void assertExtracted(Set<String> expectedSet, String text) {
     Collection<RelationTriple> extractions = annotate(text).get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
-    Set<String> guess = extractions.stream().filter(x -> x.confidence > 0.1).map(RelationTriple::toString).collect(Collectors.toSet());
+    String actual = StringUtils.join(
-    assertEquals(StringUtils.join(expected.stream().sorted(), "\n").toLowerCase(), StringUtils.join(guess.stream().map(x -> x.substring(x.indexOf("\t") + 1)).sorted(), "\n").toLowerCase());
+        extractions.stream().map(x -> x.toString().substring(x.toString().indexOf("\t") + 1).toLowerCase()).sorted(),
+        "\n");
+    String expected = StringUtils.join(expectedSet.stream().map(String::toLowerCase).sorted(), "\n");
+    assertEquals(expected, actual);
   }
 
   public void assertEntailed(String expected, String text) {
@@ -89,9 +91,9 @@ public void testBasicExtractions() {
   @Test
   public void testExtractionsGeorgeBoyd() {
     assertExtracted(new HashSet<String>() {{
+      add("George Boyd\tjoined on\t21 february 2013");
       add("George Boyd\tjoined for\tremainder");
       add("George Boyd\tjoined for\tremainder of season");
-      add("George Boyd\tjoined on\t21 february 2013");
       add("George Boyd\tjoined on\tloan");
       add("George Boyd\tjoined on\tloan from peterborough united");
     }}, "On 21 February 2013 George Boyd joined on loan from Peterborough United for the remainder of the season.");

diff --git a/lib/README b/lib/README
@@ -128,6 +128,20 @@ Not needed by CoreNLP distributions.  In core, used only by web apps (jsp pages
 LAST UPDATE: 2013-06-05
 LAST UPDATE BY: Sonal Gupta
 
+-----------------------------------------------------------------------
+commons-math3.jar
+ORIGINAL JAR NAME: commons-math3-3.5.jar
+VERSION: 3.5
+RELEASE DATE: 3-14-2015
+SOURCE AVAILABLE: yes
+DESCRIPTION: self-contained fast math routines
+URL: http://commons.apache.org/proper/commons-math/
+
+USED BY: edu.stanford.nlp.loglinear
+
+LAST UPDATE: 2015-9-25
+LAST UPDATE BY: Keenon Werling
+
 -----------------------------------------------------------------------
 commons-logging.jar
 

diff --git a/lib/antlr-runtime-3.1.2.jar b/lib/antlr-runtime-3.1.2.jar
diff --git a/lib/commons-math3.jar b/lib/commons-math3.jar
diff --git a/lib/hamcrest-core-1.3.jar b/lib/hamcrest-core-1.3.jar
diff --git a/lib/javaruntype-1.2.jar b/lib/javaruntype-1.2.jar
diff --git a/lib/junit-quickcheck-core-0.4-beta-3.jar b/lib/junit-quickcheck-core-0.4-beta-3.jar
diff --git a/lib/junit-quickcheck-generators-0.4-beta-3.jar b/lib/junit-quickcheck-generators-0.4-beta-3.jar
diff --git a/lib/junit-theories-4.12.jar b/lib/junit-theories-4.12.jar
diff --git a/lib/ognl-3.0.5.jar b/lib/ognl-3.0.5.jar
diff --git a/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunction.java b/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunction.java
@@ -378,7 +378,7 @@ protected double multiThreadGradient(List<Integer> docIDs, boolean calculateEmpi
       }
     }
 
-    // TODO: this is a huge amount of machinery for no discernible reason
+    // TODO: this is a huge amount of machinery for no discernible reason
     MulticoreWrapper<Pair<Integer, List<Integer>>, Pair<Integer, Double>> wrapper =
       new MulticoreWrapper<Pair<Integer, List<Integer>>, Pair<Integer, Double>>(multiThreadGrad, (calculateEmpirical ? expectedAndEmpiricalThreadProcessor : expectedThreadProcessor) );
 
@@ -416,6 +416,7 @@ protected double multiThreadGradient(List<Integer> docIDs, boolean calculateEmpi
   @Override
   public void calculate(double[] x) {
 
+    double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point
     // final double[][] weights = to2D(x);
     to2D(x, weights);
     setWeights(weights);
@@ -425,7 +426,7 @@ public void calculate(double[] x) {
     // double[][] E = empty2D();
     clear2D(E);
 
-    double prob = regularGradientAndValue(); // the log prob of the sequence given the model, which is the negation of value at this point
+    prob = regularGradientAndValue();
 
     if (Double.isNaN(prob)) { // shouldn't be the case
       throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()" +
@@ -463,6 +464,7 @@ public int dataDimension() {
 
   @Override
   public void calculateStochastic(double[] x, double [] v, int[] batch) {
+    double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point
     to2D(x, weights);
     setWeights(weights);
 
@@ -474,10 +476,8 @@ public void calculateStochastic(double[] x, double [] v, int[] batch) {
 
     // iterate over all the documents
     List<Integer> docIDs = new ArrayList<Integer>(batch.length);
-    for (int item : batch) {
+    for (int m=0; m < batch.length; m++) docIDs.add(batch[m]);
-      docIDs.add(item);
+    prob = multiThreadGradient(docIDs, false); 
-    }
-    double prob = multiThreadGradient(docIDs, false);  // the log prob of the sequence given the model, which is the negation of value at this point
 
     if (Double.isNaN(prob)) { // shouldn't be the case
       throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()");
@@ -522,6 +522,7 @@ public void calculateStochastic(double[] x, double [] v, int[] batch) {
    */
   @Override
   public double calculateStochasticUpdate(double[] x, double xScale, int[] batch, double gScale) {
+    double prob = 0.0; // the log prob of the sequence given the model, which is the negation of value at this point
     // int[][] wis = getWeightIndices();
     to2D(x, xScale, weights);
     setWeights(weights);
@@ -541,10 +542,8 @@ public double calculateStochasticUpdate(double[] x, double xScale, int[] batch,
 
     // iterate over all the documents
     List<Integer> docIDs = new ArrayList<Integer>(batch.length);
-    for (int item : batch) {
+    for (int m=0; m < batch.length; m++) docIDs.add(batch[m]);
-      docIDs.add(item);
+    prob = multiThreadGradient(docIDs, true); 
-    }
-    double prob = multiThreadGradient(docIDs, true); // the log prob of the sequence given the model, which is the negation of value at this point
 
     if (Double.isNaN(prob)) { // shouldn't be the case
       throw new RuntimeException("Got NaN for prob in CRFLogConditionalObjectiveFunction.calculate()");
@@ -581,10 +580,8 @@ public void calculateStochasticGradient(double[] x, int[] batch) {
 
     // iterate over all the documents
     List<Integer> docIDs = new ArrayList<Integer>(batch.length);
-    for (int item : batch) {
+    for (int m=0; m < batch.length; m++) docIDs.add(batch[m]);
-      docIDs.add(item);
+    multiThreadGradient(docIDs, true); 
-    }
-    multiThreadGradient(docIDs, true);
 
     int index = 0;
     for (int i = 0; i < E.length; i++) {
@@ -865,7 +862,7 @@ public void to2D(double[] weights1D, double wScale, double[][] newWeights) {
   public static void clear2D(double[][] arr2D) {
     for (int i = 0; i < arr2D.length; i++)
       for (int j = 0; j < arr2D[i].length; j++)
-        arr2D[i][j] = 0.0;
+        arr2D[i][j] = 0;
   }
 
   public static void to1D(double[][] weights, double[] newWeights) {
@@ -917,5 +914,4 @@ protected double[][] empty2D() {
   public int[][] getLabels() {
     return labels;
   }
-
 }
diff --git a/src/edu/stanford/nlp/io/IOUtils.java b/src/edu/stanford/nlp/io/IOUtils.java
@@ -494,8 +494,6 @@ public static InputStream getInputStreamFromURLOrClasspathOrFileSystem(String te
 
 
   // todo [cdm 2015]: I think GZIPInputStream has its own buffer and so we don't need to buffer in that case.
-  // todo: Though it's default size is 512 bytes so need to make 8K in constructor. Or else buffering outside gzip is faster
-  // todo: final InputStream is = new GZIPInputStream( new FileInputStream( file ), 65536 );
   /**
    * Quietly opens a File. If the file ends with a ".gz" extension,
    * automatically opens a GZIPInputStream to wrap the constructed
@@ -1838,7 +1836,7 @@ public static PrintWriter encodedOutputStreamPrintWriter(OutputStream stream,
 
   /**
    * A raw file copy function -- this is not public since no error checks are made as to the
-   * consistency of the file being copied. Use instead:
+   * consistency of the file being copied. Use instead:
    * @see IOUtils#cp(java.io.File, java.io.File, boolean)
    * @param source The source file. This is guaranteed to exist, and is guaranteed to be a file.
    * @param target The target file.

diff --git a/src/edu/stanford/nlp/loglinear/inference/TableFactor.java b/src/edu/stanford/nlp/loglinear/inference/TableFactor.java
@@ -3,6 +3,7 @@
 import edu.stanford.nlp.loglinear.model.ConcatVector;
 import edu.stanford.nlp.loglinear.model.GraphicalModel;
 import edu.stanford.nlp.loglinear.model.NDArrayDoubles;
+import org.apache.commons.math3.util.FastMath;
 
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -172,7 +173,7 @@ public double[][] getSummedMarginals() {
         while (true) {
             double v = getAssignmentLogValue(assignment);
             for (int i = 0; i < neighborIndices.length; i++) {
-                results[i][assignment[i]] += Math.exp(v - maxValues[i][assignment[i]]);
+                results[i][assignment[i]] += FastMath.exp(v - maxValues[i][assignment[i]]);
             }
             // This mutates the resultAssignment[] array, rather than creating a new one
             if (secondFastPassByReferenceIterator.hasNext()) {
@@ -186,7 +187,7 @@ public double[][] getSummedMarginals() {
         for (int i = 0; i < neighborIndices.length; i++) {
             double sum = 0.0;
             for (int j = 0; j < results[i].length; j++) {
-                results[i][j] = Math.exp(maxValues[i][j]) * results[i][j];
+                results[i][j] = FastMath.exp(maxValues[i][j]) * results[i][j];
                 sum += results[i][j];
             }
             if (Double.isInfinite(sum)) {
@@ -295,7 +296,7 @@ public TableFactor sumOut(int variable) {
                     for (int j = 0; j < getDimensions()[1]; j++) {
                         int index = k + j;
                         if (Double.isFinite(max[j])) {
-                            marginalized.values[j] += Math.exp(values[index] - max[j]);
+                            marginalized.values[j] += FastMath.exp(values[index] - max[j]);
                         }
                     }
                 }
@@ -304,7 +305,7 @@ public TableFactor sumOut(int variable) {
 
                 for (int j = 0; j < getDimensions()[1]; j++) {
                     if (Double.isFinite(max[j])) {
-                        marginalized.values[j] = max[j] + Math.log(marginalized.values[j]);
+                        marginalized.values[j] = max[j] + FastMath.log(marginalized.values[j]);
                     }
                     else {
                         marginalized.values[j] = max[j];
@@ -343,7 +344,7 @@ public TableFactor sumOut(int variable) {
                     for (int j = 0; j < getDimensions()[1]; j++) {
                         int index = k + j;
                         if (Double.isFinite(max[i])) {
-                            marginalized.values[i] += Math.exp(values[index] - max[i]);
+                            marginalized.values[i] += FastMath.exp(values[index] - max[i]);
                         }
                     }
                 }
@@ -352,7 +353,7 @@ public TableFactor sumOut(int variable) {
 
                 for (int i = 0; i < getDimensions()[0]; i++) {
                     if (Double.isFinite(max[i])) {
-                        marginalized.values[i] = max[i] + Math.log(marginalized.values[i]);
+                        marginalized.values[i] = max[i] + FastMath.log(marginalized.values[i]);
                     }
                     else {
                         marginalized.values[i] = max[i];
@@ -370,11 +371,11 @@ public TableFactor sumOut(int variable) {
             TableFactor maxValues = maxOut(variable);
 
             // Then we do the sum against an offset from the pivots
-            TableFactor marginalized = marginalize(variable, 0, (marginalizedVariableValue, assignment) -> (a, b) -> a + Math.exp(b - maxValues.getAssignmentLogValue(assignment)));
+            TableFactor marginalized = marginalize(variable, 0, (marginalizedVariableValue, assignment) -> (a, b) -> a + FastMath.exp(b - maxValues.getAssignmentLogValue(assignment)));
 
             // Then we factor the max values back in, and take the log to return to log-space
             for (int[] assignment : marginalized) {
-                marginalized.setAssignmentLogValue(assignment, maxValues.getAssignmentLogValue(assignment) + Math.log(marginalized.getAssignmentLogValue(assignment)));
+                marginalized.setAssignmentLogValue(assignment, maxValues.getAssignmentLogValue(assignment) + FastMath.log(marginalized.getAssignmentLogValue(assignment)));
             }
 
             return marginalized;
@@ -509,14 +510,14 @@ public double valueSum() {
 
         double sumExp = 0.0;
         for (int[] assignment : this) {
-            sumExp += Math.exp(getAssignmentLogValue(assignment) - max);
+            sumExp += FastMath.exp(getAssignmentLogValue(assignment) - max);
         }
 
-        return sumExp * Math.exp(max);
+        return sumExp * FastMath.exp(max);
     }
 
     /**
-     * Just a pass through to the NDArray version, plus a Math.exp to ensure that to the outside world the TableFactor
+     * Just a pass through to the NDArray version, plus a FastMath.exp to ensure that to the outside world the TableFactor
      * doesn't look like it's in log-space
      *
      * @param assignment a list of variable settings, in the same order as the neighbors array of the factor
@@ -526,19 +527,19 @@ public double valueSum() {
     public double getAssignmentValue(int[] assignment) {
         double d = super.getAssignmentValue(assignment);
         // if (d == null) d = Double.NEGATIVE_INFINITY;
-        return Math.exp(d);
+        return FastMath.exp(d);
     }
 
     /**
-     * Just a pass through to the NDArray version, plus a Math.log to ensure that to the outside world the TableFactor
+     * Just a pass through to the NDArray version, plus a FastMath.log to ensure that to the outside world the TableFactor
      * doesn't look like it's in log-space
      *
      * @param assignment a list of variable settings, in the same order as the neighbors array of the factor
      * @param value the value to put into the factor table
      */
     @Override
     public void setAssignmentValue(int[] assignment, double value) {
-        super.setAssignmentValue(assignment, Math.log(value));
+        super.setAssignmentValue(assignment, FastMath.log(value));
     }
 
     ////////////////////////////////////////////////////////////////////////////
@@ -651,9 +652,9 @@ private void normalizeLogArr(double[] arr) {
         }
         double expSum = 0.0;
         for (double d : arr) {
-            expSum += Math.exp(d-max);
+            expSum += FastMath.exp(d-max);
         }
-        double logSumExp = max + Math.log(expSum);
+        double logSumExp = max + FastMath.log(expSum);
 
         if (Double.isInfinite(logSumExp)) {
             // Just put in uniform probabilities if we are normalizing all 0s
@@ -664,7 +665,7 @@ private void normalizeLogArr(double[] arr) {
         else {
             // Normalize in log-scale before exponentiation, to help with stability
             for (int i = 0; i < arr.length; i++) {
-                arr[i] = Math.exp(arr[i] - logSumExp);
+                arr[i] = FastMath.exp(arr[i] - logSumExp);
             }
         }
     }