diff --git a/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java index 635f4b34e2..69191d9e6f 100644 --- a/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java +++ b/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java @@ -161,12 +161,12 @@ public void testBasicMatching() throws Exception { } /** - * The LOCATION on Ontario Place should not be overwritten since Ontario (STATE_OR_PROVINCE) - * does not span Ontario Place. Native American Church will overwrite ORGANIZATION with + * The LOCATION on Ontario Lake should not be overwritten since Ontario (STATE_OR_PROVINCE) + * does not span Ontario Lake. Native American Church will overwrite ORGANIZATION with * RELIGION. */ public void testOverwrite() throws Exception { - String str = "I like Ontario Place , and I like the Native American Church , too ."; + String str = "I like Ontario Lake , and I like the Native American Church , too ."; Annotation document = createDocument(str); annotator.annotate(document); List tokens = document.get(CoreAnnotations.TokensAnnotation.class); diff --git a/scripts/makeSerialized.csh b/scripts/makeSerialized.csh index 97f18ac419..5ddef67777 100755 --- a/scripts/makeSerialized.csh +++ b/scripts/makeSerialized.csh @@ -40,7 +40,7 @@ set ctb=/afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed # [(1, 40), (901, 931), (1018, 1018), (1020, 1020), (1036, 1036), (1044, 1044), (1060, 1061), (1072, 1072), (1118, 1119), (1132, 1132), (1141, 1142), (1148, 1148), (2165, 2180), (2295, 2310), (2570, 2602), (2800, 2819)] set ctb7train=/u/nlp/data/chinese/ctb7/train.mrg set ctb7test=/u/nlp/data/chinese/ctb7/test.mrg -set negra=/afs/ir/data/linguistic-data/NEGRA/penn-format-train-dev-test +set negra=/u/nlp/data/GermanACL08/negra/penn-format-train-dev-test set host=`hostname | cut -d. 
-f1` diff --git a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java index 5ebd403873..13f10096e2 100644 --- a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java +++ b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java @@ -1052,11 +1052,18 @@ private void classifyAndWriteAnswers(Collection> documents, IOUtils.encodedOutputStreamPrintWriter(System.out, flags.outputEncoding, true), readerWriter); } + /** Does nothing by default. Children classes can override if necessary */ + public void dumpFeatures(Collection> documents) {} + public void classifyAndWriteAnswers(Collection> documents, PrintWriter printWriter, DocumentReaderAndWriter readerWriter) throws IOException { + if (flags.exportFeatures != null) { + dumpFeatures(documents); + } + Timing timer = new Timing(); Counter entityTP = new ClassicCounter(); diff --git a/src/edu/stanford/nlp/ie/crf/CRFClassifier.java b/src/edu/stanford/nlp/ie/crf/CRFClassifier.java index be4b834184..50c5a3fe86 100644 --- a/src/edu/stanford/nlp/ie/crf/CRFClassifier.java +++ b/src/edu/stanford/nlp/ie/crf/CRFClassifier.java @@ -1101,6 +1101,18 @@ private double[] makeDatumUsingEmbedding(List info, int loc, List> docs) { + if (flags.exportFeatures != null) { + Timing timer = new Timing(); + timer.start(); + CRFFeatureExporter featureExporter = new CRFFeatureExporter(this); + featureExporter.printFeatures(flags.exportFeatures, docs); + long elapsedMs = timer.stop(); + System.err.println("Time to export features: " + Timing.toSecondsString(elapsedMs) + " seconds"); + } + } + @Override public List classify(List document) { if (flags.doGibbs) { @@ -1599,11 +1611,7 @@ public void train(Collection> objectBankWrapper, DocumentReaderAndWrite } if (flags.exportFeatures != null) { - timer.start(); - CRFFeatureExporter featureExporter = new CRFFeatureExporter(this); - featureExporter.printFeatures(flags.exportFeatures, docs); - elapsedMs = timer.stop(); - 
System.err.println("Time to export features: " + Timing.toSecondsString(elapsedMs) + " seconds"); + dumpFeatures(docs); } for (int i = 0; i <= flags.numTimesPruneFeatures; i++) { diff --git a/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java b/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java index 9b6c2cecc8..5f068a46cb 100644 --- a/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java +++ b/src/edu/stanford/nlp/ie/crf/CRFFeatureExporter.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -79,7 +80,9 @@ private String getFeatureString(List document) { List> features = d.asFeatures(); for (Collection cliqueFeatures : features) { - for (String feat : cliqueFeatures) { + List sortedFeatures = new ArrayList(cliqueFeatures); + Collections.sort(sortedFeatures); + for (String feat : sortedFeatures) { feat = ubPrefixFeatureString(feat); sb.append(delimiter); sb.append(feat); diff --git a/src/edu/stanford/nlp/sentiment/SentimentPipeline.java b/src/edu/stanford/nlp/sentiment/SentimentPipeline.java index 26a00311eb..b3386edd5e 100644 --- a/src/edu/stanford/nlp/sentiment/SentimentPipeline.java +++ b/src/edu/stanford/nlp/sentiment/SentimentPipeline.java @@ -38,8 +38,11 @@ * -parserModel Which parser model to use, defaults to englishPCFG.ser.gz
* -sentimentModel Which sentiment model to use, defaults to sentiment.ser.gz
* -file Which file to process.
+ * -fileList A comma-separated list of files to process.
* -stdin Read one line at a time from stdin.
- * -output pennTrees: Output trees with scores at each binarized node. vectors: Number tree nodes and print out the vectors. Defaults to printing just the root.
+ * -output pennTrees: Output trees with scores at each binarized node. vectors: Number tree nodes and print out the vectors. probabilities: Output the scores for different labels for each node. Defaults to printing just the root.
+ * -filterUnknown Remove unknown trees from the input. Only applies to TREES input, in which case the trees must be binarized with sentiment labels.
+ * -help Print out help.
* * @author John Bauer */ diff --git a/src/edu/stanford/nlp/tagger/util/CountTagSequences.java b/src/edu/stanford/nlp/tagger/util/CountTagSequences.java deleted file mode 100644 index 3a41b293d0..0000000000 --- a/src/edu/stanford/nlp/tagger/util/CountTagSequences.java +++ /dev/null @@ -1,135 +0,0 @@ -package edu.stanford.nlp.tagger.util; - -import edu.stanford.nlp.ling.Label; -import edu.stanford.nlp.ling.Tag; -import edu.stanford.nlp.trees.*; -import edu.stanford.nlp.util.HashIndex; -import edu.stanford.nlp.util.Index; -import edu.stanford.nlp.util.StringUtils; - -import java.io.Reader; -import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -/** - * This class makes a table of tag bigram counts and percentages in data. - * It counts the bigrams (including sentence ends) prints out both a count - * matrix - * and percentages. The only clever part was learning how to format floating - * point numbers in Java. - * - * @author Christopher Manning - * @version 1.0 - */ -public final class CountTagSequences { - - private static final int MAXTAGS = 50; - private static final int FIELDLENG = 7; - - private static Index