Skip to content

Commit

Permalink
Merge branch 'master' into mt-preordering-feat
Browse files Browse the repository at this point in the history
  • Loading branch information
sebschu authored and Stanford NLP committed Feb 18, 2015
1 parent f1e1e47 commit ec46b99
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 146 deletions.
Expand Up @@ -161,12 +161,12 @@ public void testBasicMatching() throws Exception {
}

/**
* The LOCATION on Ontario Place should not be overwritten since Ontario (STATE_OR_PROVINCE)
* does not span Ontario Place. Native American Church will overwrite ORGANIZATION with
* The LOCATION on Ontario Lake should not be overwritten since Ontario (STATE_OR_PROVINCE)
* does not span Ontario Lake. Native American Church will overwrite ORGANIZATION with
* RELIGION.
*/
public void testOverwrite() throws Exception {
String str = "I like Ontario Place , and I like the Native American Church , too .";
String str = "I like Ontario Lake , and I like the Native American Church , too .";
Annotation document = createDocument(str);
annotator.annotate(document);
List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
Expand Down
2 changes: 1 addition & 1 deletion scripts/makeSerialized.csh
Expand Up @@ -40,7 +40,7 @@ set ctb=/afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed
# [(1, 40), (901, 931), (1018, 1018), (1020, 1020), (1036, 1036), (1044, 1044), (1060, 1061), (1072, 1072), (1118, 1119), (1132, 1132), (1141, 1142), (1148, 1148), (2165, 2180), (2295, 2310), (2570, 2602), (2800, 2819)]
set ctb7train=/u/nlp/data/chinese/ctb7/train.mrg
set ctb7test=/u/nlp/data/chinese/ctb7/test.mrg
set negra=/afs/ir/data/linguistic-data/NEGRA/penn-format-train-dev-test
set negra=/u/nlp/data/GermanACL08/negra/penn-format-train-dev-test

set host=`hostname | cut -d. -f1`

Expand Down
7 changes: 7 additions & 0 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
Expand Up @@ -1052,11 +1052,18 @@ private void classifyAndWriteAnswers(Collection<List<IN>> documents,
IOUtils.encodedOutputStreamPrintWriter(System.out, flags.outputEncoding, true), readerWriter);
}

/**
 * Dumps the features extracted from the given documents (e.g. for debugging
 * or offline analysis). Does nothing by default; subclasses that actually
 * compute features — such as the CRF-based classifier — can override this
 * to write them out.
 *
 * @param documents the documents whose features should be dumped
 */
public void dumpFeatures(Collection<List<IN>> documents) {}

public void classifyAndWriteAnswers(Collection<List<IN>> documents,
PrintWriter printWriter,
DocumentReaderAndWriter<IN> readerWriter)
throws IOException
{
if (flags.exportFeatures != null) {
dumpFeatures(documents);
}

Timing timer = new Timing();

Counter<String> entityTP = new ClassicCounter<String>();
Expand Down
18 changes: 13 additions & 5 deletions src/edu/stanford/nlp/ie/crf/CRFClassifier.java
Expand Up @@ -1101,6 +1101,18 @@ private double[] makeDatumUsingEmbedding(List<IN> info, int loc, List<FeatureFac
return featureValArr;
}

/**
 * Writes out the features of the given documents via a
 * {@link CRFFeatureExporter}, timing the export and reporting the elapsed
 * time on stderr. A no-op unless {@code flags.exportFeatures} names an
 * output destination.
 */
@Override
public void dumpFeatures(Collection<List<IN>> docs) {
  // Nothing to do unless an export target was configured on the flags.
  if (flags.exportFeatures == null) {
    return;
  }
  Timing exportTimer = new Timing();
  exportTimer.start();
  CRFFeatureExporter<IN> exporter = new CRFFeatureExporter<IN>(this);
  exporter.printFeatures(flags.exportFeatures, docs);
  long ms = exportTimer.stop();
  System.err.println("Time to export features: " + Timing.toSecondsString(ms) + " seconds");
}

@Override
public List<IN> classify(List<IN> document) {
if (flags.doGibbs) {
Expand Down Expand Up @@ -1599,11 +1611,7 @@ public void train(Collection<List<IN>> objectBankWrapper, DocumentReaderAndWrite
}

if (flags.exportFeatures != null) {
timer.start();
CRFFeatureExporter<IN> featureExporter = new CRFFeatureExporter<IN>(this);
featureExporter.printFeatures(flags.exportFeatures, docs);
elapsedMs = timer.stop();
System.err.println("Time to export features: " + Timing.toSecondsString(elapsedMs) + " seconds");
dumpFeatures(docs);
}

for (int i = 0; i <= flags.numTimesPruneFeatures; i++) {
Expand Down
5 changes: 4 additions & 1 deletion src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java
Expand Up @@ -8,6 +8,7 @@

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -79,7 +80,9 @@ private String getFeatureString(List<IN> document) {

List<List<String>> features = d.asFeatures();
for (Collection<String> cliqueFeatures : features) {
for (String feat : cliqueFeatures) {
List<String> sortedFeatures = new ArrayList<String>(cliqueFeatures);
Collections.sort(sortedFeatures);
for (String feat : sortedFeatures) {
feat = ubPrefixFeatureString(feat);
sb.append(delimiter);
sb.append(feat);
Expand Down
5 changes: 4 additions & 1 deletion src/edu/stanford/nlp/sentiment/SentimentPipeline.java
Expand Up @@ -38,8 +38,11 @@
* <code>-parserModel</code> Which parser model to use, defaults to englishPCFG.ser.gz <br>
* <code>-sentimentModel</code> Which sentiment model to use, defaults to sentiment.ser.gz <br>
* <code>-file</code> Which file to process. <br>
* <code>-fileList</code> A comma separated list of files to process. <br>
* <code>-stdin</code> Read one line at a time from stdin. <br>
* <code>-output</code> pennTrees: Output trees with scores at each binarized node. vectors: Number tree nodes and print out the vectors. Defaults to printing just the root. <br>
* <code>-output</code> pennTrees: Output trees with scores at each binarized node. vectors: Number tree nodes and print out the vectors. probabilities: Output the scores for different labels for each node. Defaults to printing just the root. <br>
* <code>-filterUnknown</code> remove unknown trees from the input. Only applies to TREES input, in which case the trees must be binarized with sentiment labels <br>
* <code>-help</code> Print out help <br>
*
* @author John Bauer
*/
Expand Down
135 changes: 0 additions & 135 deletions src/edu/stanford/nlp/tagger/util/CountTagSequences.java

This file was deleted.

0 comments on commit ec46b99

Please sign in to comment.