Skip to content

Commit

Permalink
Fix bug in CRFClassifier main; better document readAndWriter fields.
Browse files Browse the repository at this point in the history
  • Loading branch information
manning authored and Stanford NLP committed Aug 15, 2016
1 parent cb1d464 commit caa75ee
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 26 deletions.
48 changes: 33 additions & 15 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
Expand Up @@ -103,11 +103,27 @@ public abstract class AbstractSequenceClassifier<IN extends CoreMap> implements
protected MaxSizeConcurrentHashSet<String> knownLCWords; // = null;


private DocumentReaderAndWriter<IN> defaultReaderAndWriter;

/** This is the DocumentReaderAndWriter used for reading training and testing files.
* It is the DocumentReaderAndWriter specified by the readerAndWriter flag and
* defaults to {@code edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter} which
* is suitable for reading CoNLL-style TSV files.
*
* @return The default DocumentReaderAndWriter
*/
public DocumentReaderAndWriter<IN> defaultReaderAndWriter() {
  return defaultReaderAndWriter;
}


private DocumentReaderAndWriter<IN> plainTextReaderAndWriter;

/** This is the default DocumentReaderAndWriter used for reading text files for runtime
* classification. It is the DocumentReaderAndWriter specified by the plainTextDocumentReaderAndWriter
* flag and defaults to {@code edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter} which
* is suitable for reading plain text files, in languages with a Tokenizer available.
*
* @return The default plain text DocumentReaderAndWriter
*/
public DocumentReaderAndWriter<IN> plainTextReaderAndWriter() {
  return plainTextReaderAndWriter;
}
Expand Down Expand Up @@ -1011,15 +1027,11 @@ public static void outputCalibrationInfo(PrintWriter pw,
pw.println("----------------------------------------");
}


public void classifyStdin() public void classifyStdin() throws IOException {
throws IOException
{
classifyStdin(plainTextReaderAndWriter); classifyStdin(plainTextReaderAndWriter);
} }


public void classifyStdin(DocumentReaderAndWriter<IN> readerWriter) public void classifyStdin(DocumentReaderAndWriter<IN> readerWriter) throws IOException {
throws IOException
{
BufferedReader is = IOUtils.readerFromStdin(flags.inputEncoding);
for (String line; (line = is.readLine()) != null; ) {
  Collection<List<IN>> documents = makeObjectBankFromString(line, readerWriter);
Expand All @@ -1039,8 +1051,9 @@ public void dumpFeatures(Collection<List<IN>> documents) {}


/** /**
* Load a test file, run the classifier on it, and then print the answers to * Load a test file, run the classifier on it, and then print the answers to
* stdout (with timing to stderr). This uses the value of flags.documentReader * stdout (with timing to stderr). This uses the value of flags.readerAndWriter
* to determine testFile format. * to determine testFile format. <i>Note:</i> This means that it works right for
* a testFile and not a plain textFile.
* *
* @param testFile The file to test on. * @param testFile The file to test on.
*/ */
Expand All @@ -1065,8 +1078,7 @@ public Triple<Double,Double,Double> classifyAndWriteAnswers(String testFile, boo


/** /**
* Load a test file, run the classifier on it, and then print the answers to * Load a test file, run the classifier on it, and then print the answers to
* stdout (with timing to stderr). This uses the value of flags.documentReader * stdout (with timing to stderr).
* to determine testFile format.
* *
* @param testFile The file to test on. * @param testFile The file to test on.
* @param readerWriter A reader and writer to use for the output * @param readerWriter A reader and writer to use for the output
Expand Down Expand Up @@ -1102,9 +1114,15 @@ public Triple<Double,Double,Double> classifyAndWriteAnswers(String baseDir, Stri
return classifyAndWriteAnswers(documents, readerWriter, outputScores); return classifyAndWriteAnswers(documents, readerWriter, outputScores);
} }


public void classifyFilesAndWriteAnswers(Collection<File> testFiles) /** Run the classifier on a collection of text files.
* Uses the plainTextReaderAndWriter to process them.
*
* @param textFiles A File Collection to process.
* @throws IOException For any IO error
*/
public void classifyFilesAndWriteAnswers(Collection<File> textFiles)
throws IOException { throws IOException {
classifyFilesAndWriteAnswers(testFiles, plainTextReaderAndWriter, false); classifyFilesAndWriteAnswers(textFiles, plainTextReaderAndWriter, false);
} }


public void classifyFilesAndWriteAnswers(Collection<File> testFiles, public void classifyFilesAndWriteAnswers(Collection<File> testFiles,
Expand Down Expand Up @@ -1275,8 +1293,7 @@ public void classifyAndWriteAnswersKBest(ObjectBank<List<IN>> documents, int k,
*/ */
public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGraphPrefix, DocumentReaderAndWriter<IN> readerAndWriter) throws IOException { public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGraphPrefix, DocumentReaderAndWriter<IN> readerAndWriter) throws IOException {
Timing timer = new Timing(); Timing timer = new Timing();
ObjectBank<List<IN>> documents = ObjectBank<List<IN>> documents = makeObjectBankFromFile(testFile, readerAndWriter);
makeObjectBankFromFile(testFile, readerAndWriter);
int numWords = 0; int numWords = 0;
int numSentences = 0; int numSentences = 0;


Expand All @@ -1286,8 +1303,9 @@ public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGra
PrintWriter latticeWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences PrintWriter latticeWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences
+ ".wlattice")); + ".wlattice"));
PrintWriter vsgWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences + ".lattice")); PrintWriter vsgWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences + ".lattice"));
if (readerAndWriter instanceof LatticeWriter) if (readerAndWriter instanceof LatticeWriter) {
((LatticeWriter<IN, String, Integer>) readerAndWriter).printLattice(tagLattice, doc, latticeWriter); ((LatticeWriter<IN, String, Integer>) readerAndWriter).printLattice(tagLattice, doc, latticeWriter);
}
tagLattice.printAttFsmFormat(vsgWriter); tagLattice.printAttFsmFormat(vsgWriter);
latticeWriter.close(); latticeWriter.close();
vsgWriter.close(); vsgWriter.close();
Expand Down
9 changes: 3 additions & 6 deletions src/edu/stanford/nlp/ie/crf/CRFClassifier.java
Expand Up @@ -3060,7 +3060,7 @@ public static void main(String[] args) throws Exception {
// todo: Change testFile to call testFiles with a singleton list
DocumentReaderAndWriter<CoreLabel> readerAndWriter = crf.defaultReaderAndWriter();
if (crf.flags.searchGraphPrefix != null) { if (crf.flags.searchGraphPrefix != null) {
crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix, crf.makeReaderAndWriter()); crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix, readerAndWriter);
} else if (crf.flags.printFirstOrderProbs) { } else if (crf.flags.printFirstOrderProbs) {
crf.printFirstOrderProbs(testFile, readerAndWriter); crf.printFirstOrderProbs(testFile, readerAndWriter);
} else if (crf.flags.printFactorTable) { } else if (crf.flags.printFactorTable) {
Expand All @@ -3087,14 +3087,11 @@ public static void main(String[] args) throws Exception {
} }


if (textFile != null) { if (textFile != null) {
crf.classifyAndWriteAnswers(textFile); crf.classifyAndWriteAnswers(textFile, crf.plainTextReaderAndWriter(), false);
} }


if (textFiles != null) { if (textFiles != null) {
List<File> files = new ArrayList<>(); List<File> files = Arrays.stream(textFiles.split(",")).map(File::new).collect(Collectors.toList());
for (String filename : textFiles.split(",")) {
files.add(new File(filename));
}
crf.classifyFilesAndWriteAnswers(files);
}


Expand Down
11 changes: 6 additions & 5 deletions src/edu/stanford/nlp/process/AbstractTokenizer.java
@@ -1,11 +1,12 @@
package edu.stanford.nlp.process; package edu.stanford.nlp.process;
import edu.stanford.nlp.util.logging.Redwood;



import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;


// import edu.stanford.nlp.util.logging.Redwood;


/** /**
* An abstract tokenizer. Tokenizers extending AbstractTokenizer need only * An abstract tokenizer. Tokenizers extending AbstractTokenizer need only
* implement the {@code getNext()} method. This implementation does not * implement the {@code getNext()} method. This implementation does not
Expand All @@ -18,8 +19,8 @@


public abstract class AbstractTokenizer<T> implements Tokenizer<T> { public abstract class AbstractTokenizer<T> implements Tokenizer<T> {


/** A logger for this class */ // /** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(AbstractTokenizer.class); // private static final Redwood.RedwoodChannels log = Redwood.channels(AbstractTokenizer.class);


protected T nextToken; // = null; protected T nextToken; // = null;


Expand Down

0 comments on commit caa75ee

Please sign in to comment.