Merge branch 'master' of jamie.stanford.edu:/u/nlp/git/javanlp
Gabor Angeli authored and Stanford NLP committed Jul 9, 2016
1 parent ca831b8 commit 04d7b8d
Showing 30 changed files with 84,917 additions and 64,263 deletions.
32 changes: 16 additions & 16 deletions itest/src/edu/stanford/nlp/dcoref/DcorefBenchmarkSlowITest.java
@@ -98,40 +98,40 @@ public void testDcoref() throws Exception {
expectedResults.setCount(MENTION_F1, 50.42);
highResults.setCount(MENTION_F1, 50.45);

-lowResults.setCount(MUC_TP, 6250);
-expectedResults.setCount(MUC_TP, 6253);
-highResults.setCount(MUC_TP, 6260);
+lowResults.setCount(MUC_TP, 6245);
+expectedResults.setCount(MUC_TP, 6250);
+highResults.setCount(MUC_TP, 6255);
lowResults.setCount(MUC_F1, 60.65);
-expectedResults.setCount(MUC_F1, 60.67);
+expectedResults.setCount(MUC_F1, 60.66);
highResults.setCount(MUC_F1, 60.7);

-lowResults.setCount(BCUBED_TP, 12450);
-expectedResults.setCount(BCUBED_TP, 12457.63);
-highResults.setCount(BCUBED_TP, 12460);
-lowResults.setCount(BCUBED_F1, 70.8);
-expectedResults.setCount(BCUBED_F1, 70.81);
+lowResults.setCount(BCUBED_TP, 12440);
+expectedResults.setCount(BCUBED_TP, 12445.8);
+highResults.setCount(BCUBED_TP, 12450);
+lowResults.setCount(BCUBED_F1, 70.75);
+expectedResults.setCount(BCUBED_F1, 70.80);
highResults.setCount(BCUBED_F1, 70.85);

-lowResults.setCount(CEAFM_TP, 10920);
-expectedResults.setCount(CEAFM_TP, 10927);
+lowResults.setCount(CEAFM_TP, 10915);
+expectedResults.setCount(CEAFM_TP, 10920);
highResults.setCount(CEAFM_TP, 10930);
lowResults.setCount(CEAFM_F1, 59.4);
-expectedResults.setCount(CEAFM_F1, 59.44);
+expectedResults.setCount(CEAFM_F1, 59.42);
highResults.setCount(CEAFM_F1, 59.5);

lowResults.setCount(CEAFE_TP, 3830);
-expectedResults.setCount(CEAFE_TP, 3833.81);
+expectedResults.setCount(CEAFE_TP, 3831.36);
highResults.setCount(CEAFE_TP, 3840);
lowResults.setCount(CEAFE_F1, 47.4);
-expectedResults.setCount(CEAFE_F1, 47.46);
+expectedResults.setCount(CEAFE_F1, 47.45);
highResults.setCount(CEAFE_F1, 47.5);

lowResults.setCount(BLANC_F1, 75.35);
-expectedResults.setCount(BLANC_F1, 75.39);
+expectedResults.setCount(BLANC_F1, 75.38);
highResults.setCount(BLANC_F1, 75.42);

lowResults.setCount(CONLL_SCORE, 59.6);
-expectedResults.setCount(CONLL_SCORE, 59.65);
+expectedResults.setCount(CONLL_SCORE, 59.64);
highResults.setCount(CONLL_SCORE, 59.7);

Counter<String> results = new ClassicCounter<String>();
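The test above encodes a tolerance band for each coreference metric: lowResults and highResults bound the acceptable range, while expectedResults records the value the current model should produce. Below is a minimal, hypothetical sketch of such a band check using only the Counter API from edu.stanford.nlp.stats; the checkWithinBounds helper and the plain string keys are illustrative, not the actual assertion code in DcorefBenchmarkSlowITest.

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class BoundsCheckSketch {
  // Hypothetical helper: fail if any metric in 'actual' falls outside [low, high].
  static void checkWithinBounds(Counter<String> low, Counter<String> high,
                                Counter<String> actual) {
    for (String metric : actual.keySet()) {
      double value = actual.getCount(metric);
      if (value < low.getCount(metric) || value > high.getCount(metric)) {
        throw new AssertionError(metric + " = " + value + " is outside ["
            + low.getCount(metric) + ", " + high.getCount(metric) + "]");
      }
    }
  }

  public static void main(String[] args) {
    Counter<String> low = new ClassicCounter<>();
    Counter<String> high = new ClassicCounter<>();
    Counter<String> actual = new ClassicCounter<>();
    low.setCount("MUC_F1", 60.65);    // bounds taken from the diff above
    high.setCount("MUC_F1", 60.7);
    actual.setCount("MUC_F1", 60.66); // the new expected value
    checkWithinBounds(low, high, actual);  // passes: 60.66 lies inside the band
  }
}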
22 changes: 13 additions & 9 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -657,20 +657,21 @@ public List<Triple<String, Integer, Integer>> classifyToCharacterOffsets(String
}

/**
-* ONLY USE IF LOADED A CHINESE WORD SEGMENTER!!!!!
+* Have a word segmenter segment a String into a list of words.
+* ONLY USE IF YOU LOADED A CHINESE WORD SEGMENTER!!!!!
*
-* @param sentence
-* The string to be classified
+* @param sentence The string to be classified
* @return List of words
*/
+// This method is currently [2016] only called in a very small number of places:
+// the parser's jsp webapp, ChineseSegmenterAnnotator, and SegDemo.
+// Maybe we could eliminate it?
public List<String> segmentString(String sentence) {
return segmentString(sentence, defaultReaderAndWriter);
}

-public List<String> segmentString(String sentence,
-DocumentReaderAndWriter<IN> readerAndWriter) {
-ObjectBank<List<IN>> docs = makeObjectBankFromString(sentence,
-readerAndWriter);
+public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> readerAndWriter) {
+ObjectBank<List<IN>> docs = makeObjectBankFromString(sentence, readerAndWriter);

StringWriter stringWriter = new StringWriter();
PrintWriter stringPrintWriter = new PrintWriter(stringWriter);
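For reference, segmentString is the call made when a CRFClassifier has been trained as a Chinese word segmenter (this is how SegDemo and ChineseSegmenterAnnotator use it). A minimal usage sketch follows; the property values and the data/ctb.gz model path are assumptions matching the files shipped with the Stanford Segmenter distribution, not something fixed by this commit.

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import java.util.List;
import java.util.Properties;

public class SegmentStringSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Illustrative segmenter settings; the real paths depend on your download.
    props.setProperty("sighanCorporaDict", "data");
    props.setProperty("serDictionary", "data/dict-chris6.ser.gz");
    props.setProperty("sighanPostProcessing", "true");
    CRFClassifier<CoreLabel> segmenter = new CRFClassifier<>(props);
    segmenter.loadClassifierNoExceptions("data/ctb.gz", props);
    // Only valid because a Chinese word segmenter model was loaded above.
    List<String> words = segmenter.segmentString("面对新世纪，世界各国人民的共同愿望");
    System.out.println(words);  // one segmented word per list element
  }
}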
@@ -685,7 +686,7 @@ public List<String> segmentString(String sentence,
return Arrays.asList(segmented.split("\\s"));
}

-/**
+/*
* Classify the contents of {@link SeqClassifierFlags scf.testFile}. The file
* should be in the format expected based on {@link SeqClassifierFlags
* scf.documentReader}.
@@ -707,7 +708,7 @@ public List<String> segmentString(String sentence,
* @return The same {@link List}, but with the elements annotated with their
* answers (stored under the
* {@link edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation}
-* key).
+* key). The answers will be the class labels defined by the CRF
+* Classifier. They might be things like entity labels (in BIO
+* notation or not) or something like "1" vs. "0" on whether to
+* begin a new token here or not (in word segmentation).
*/
public abstract List<IN> classify(List<IN> document);

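The expanded javadoc above spells out what comes back from classify: every token is annotated under CoreAnnotations.AnswerAnnotation with whatever label set the model was trained on (NER labels, BIO tags, or segmentation decisions). A small sketch of reading those answers back from a concrete subclass, assuming an NER model; the model filename is illustrative, and classify(String) is the convenience overload rather than the abstract classify(List<IN>) shown here.

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import java.util.List;

public class ClassifyAnswersSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative model path; any serialized CRF NER model will do.
    CRFClassifier<CoreLabel> ner =
        CRFClassifier.getClassifier("english.all.3class.distsim.crf.ser.gz");
    List<List<CoreLabel>> sentences = ner.classify("Stanford is in California.");
    for (List<CoreLabel> sentence : sentences) {
      for (CoreLabel token : sentence) {
        // The answer is whatever label the classifier was trained with,
        // e.g. ORGANIZATION / LOCATION / O for a 3-class NER model.
        String answer = token.get(CoreAnnotations.AnswerAnnotation.class);
        System.out.println(token.word() + "\t" + answer);
      }
    }
  }
}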
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/ie/crf/CRFClassifier.java
@@ -960,7 +960,7 @@ protected static Index<CRFLabel> allLabels(int window, Index<String> classIndex)
* Makes a CRFDatum by producing features and a label from input data at a
* specific position, using the provided factory.
*
-* @param info The input data
+* @param info The input data. Particular feature factories might look for arbitrary keys in the IN items.
* @param loc The position to build a datum at
* @param featureFactories The FeatureFactories to use to extract features
* @return The constructed CRFDatum
