Skip to content

Commit

Permalink
Fix bug in CRFClassifier main; better document readAndWriter fields.
Browse files Browse the repository at this point in the history
  • Loading branch information
manning authored and Stanford NLP committed Aug 15, 2016
1 parent cb1d464 commit caa75ee
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 26 deletions.
48 changes: 33 additions & 15 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
Expand Up @@ -103,11 +103,27 @@ public abstract class AbstractSequenceClassifier<IN extends CoreMap> implements
protected MaxSizeConcurrentHashSet<String> knownLCWords; // = null;


private DocumentReaderAndWriter<IN> defaultReaderAndWriter;

/** This is the DocumentReaderAndWriter used for reading training and testing files.
* It is the DocumentReaderAndWriter specified by the readerAndWriter flag and
* defaults to {@code edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter} which
* is suitable for reading CoNLL-style TSV files.
*
* @return The default DocumentReaderAndWriter
*/
public DocumentReaderAndWriter<IN> defaultReaderAndWriter() {
  return defaultReaderAndWriter;
}


private DocumentReaderAndWriter<IN> plainTextReaderAndWriter;

/** This is the default DocumentReaderAndWriter used for reading text files for runtime
* classification. It is the DocumentReaderAndWriter specified by the plainTextDocumentReaderAndWriter
* flag and defaults to {@code edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter} which
* is suitable for reading plain text files, in languages with a Tokenizer available.
*
* @return The default plain text DocumentReaderAndWriter
*/
public DocumentReaderAndWriter<IN> plainTextReaderAndWriter() {
  return plainTextReaderAndWriter;
}
Expand Down Expand Up @@ -1011,15 +1027,11 @@ public static void outputCalibrationInfo(PrintWriter pw,
pw.println("----------------------------------------");
}


public void classifyStdin() public void classifyStdin() throws IOException {
throws IOException
{
classifyStdin(plainTextReaderAndWriter); classifyStdin(plainTextReaderAndWriter);
} }


public void classifyStdin(DocumentReaderAndWriter<IN> readerWriter) public void classifyStdin(DocumentReaderAndWriter<IN> readerWriter) throws IOException {
throws IOException
{
BufferedReader is = IOUtils.readerFromStdin(flags.inputEncoding);
for (String line; (line = is.readLine()) != null; ) {
  Collection<List<IN>> documents = makeObjectBankFromString(line, readerWriter);
Expand All @@ -1039,8 +1051,9 @@ public void dumpFeatures(Collection<List<IN>> documents) {}


/** /**
* Load a test file, run the classifier on it, and then print the answers to * Load a test file, run the classifier on it, and then print the answers to
* stdout (with timing to stderr). This uses the value of flags.documentReader * stdout (with timing to stderr). This uses the value of flags.readerAndWriter
* to determine testFile format. * to determine testFile format. <i>Note:</i> This means that it works right for
* a testFile and not a plain textFile.
* *
* @param testFile The file to test on. * @param testFile The file to test on.
*/ */
Expand All @@ -1065,8 +1078,7 @@ public Triple<Double,Double,Double> classifyAndWriteAnswers(String testFile, boo


/** /**
* Load a test file, run the classifier on it, and then print the answers to * Load a test file, run the classifier on it, and then print the answers to
* stdout (with timing to stderr). This uses the value of flags.documentReader * stdout (with timing to stderr).
* to determine testFile format.
* *
* @param testFile The file to test on. * @param testFile The file to test on.
* @param readerWriter A reader and writer to use for the output * @param readerWriter A reader and writer to use for the output
Expand Down Expand Up @@ -1102,9 +1114,15 @@ public Triple<Double,Double,Double> classifyAndWriteAnswers(String baseDir, Stri
return classifyAndWriteAnswers(documents, readerWriter, outputScores); return classifyAndWriteAnswers(documents, readerWriter, outputScores);
} }


public void classifyFilesAndWriteAnswers(Collection<File> testFiles) /** Run the classifier on a collection of text files.
* Uses the plainTextReaderAndWriter to process them.
*
* @param textFiles A File Collection to process.
* @throws IOException For any IO error
*/
public void classifyFilesAndWriteAnswers(Collection<File> textFiles)
throws IOException { throws IOException {
classifyFilesAndWriteAnswers(testFiles, plainTextReaderAndWriter, false); classifyFilesAndWriteAnswers(textFiles, plainTextReaderAndWriter, false);
} }


public void classifyFilesAndWriteAnswers(Collection<File> testFiles, public void classifyFilesAndWriteAnswers(Collection<File> testFiles,
Expand Down Expand Up @@ -1275,8 +1293,7 @@ public void classifyAndWriteAnswersKBest(ObjectBank<List<IN>> documents, int k,
*/ */
public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGraphPrefix, DocumentReaderAndWriter<IN> readerAndWriter) throws IOException { public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGraphPrefix, DocumentReaderAndWriter<IN> readerAndWriter) throws IOException {
Timing timer = new Timing(); Timing timer = new Timing();
ObjectBank<List<IN>> documents = ObjectBank<List<IN>> documents = makeObjectBankFromFile(testFile, readerAndWriter);
makeObjectBankFromFile(testFile, readerAndWriter);
int numWords = 0; int numWords = 0;
int numSentences = 0; int numSentences = 0;


Expand All @@ -1286,8 +1303,9 @@ public void classifyAndWriteViterbiSearchGraph(String testFile, String searchGra
PrintWriter latticeWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences PrintWriter latticeWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences
+ ".wlattice")); + ".wlattice"));
PrintWriter vsgWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences + ".lattice")); PrintWriter vsgWriter = new PrintWriter(new FileOutputStream(searchGraphPrefix + '.' + numSentences + ".lattice"));
if (readerAndWriter instanceof LatticeWriter) if (readerAndWriter instanceof LatticeWriter) {
((LatticeWriter<IN, String, Integer>) readerAndWriter).printLattice(tagLattice, doc, latticeWriter); ((LatticeWriter<IN, String, Integer>) readerAndWriter).printLattice(tagLattice, doc, latticeWriter);
}
tagLattice.printAttFsmFormat(vsgWriter); tagLattice.printAttFsmFormat(vsgWriter);
latticeWriter.close(); latticeWriter.close();
vsgWriter.close(); vsgWriter.close();
Expand Down
9 changes: 3 additions & 6 deletions src/edu/stanford/nlp/ie/crf/CRFClassifier.java
Expand Up @@ -3060,7 +3060,7 @@ public static void main(String[] args) throws Exception {
// todo: Change testFile to call testFiles with a singleton list
DocumentReaderAndWriter<CoreLabel> readerAndWriter = crf.defaultReaderAndWriter();
if (crf.flags.searchGraphPrefix != null) { if (crf.flags.searchGraphPrefix != null) {
crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix, crf.makeReaderAndWriter()); crf.classifyAndWriteViterbiSearchGraph(testFile, crf.flags.searchGraphPrefix, readerAndWriter);
} else if (crf.flags.printFirstOrderProbs) { } else if (crf.flags.printFirstOrderProbs) {
crf.printFirstOrderProbs(testFile, readerAndWriter); crf.printFirstOrderProbs(testFile, readerAndWriter);
} else if (crf.flags.printFactorTable) { } else if (crf.flags.printFactorTable) {
Expand All @@ -3087,14 +3087,11 @@ public static void main(String[] args) throws Exception {
} }


if (textFile != null) { if (textFile != null) {
crf.classifyAndWriteAnswers(textFile); crf.classifyAndWriteAnswers(textFile, crf.plainTextReaderAndWriter(), false);
} }


if (textFiles != null) { if (textFiles != null) {
List<File> files = new ArrayList<>(); List<File> files = Arrays.stream(textFiles.split(",")).map(File::new).collect(Collectors.toList());
for (String filename : textFiles.split(",")) {
files.add(new File(filename));
}
crf.classifyFilesAndWriteAnswers(files);
}


Expand Down
11 changes: 6 additions & 5 deletions src/edu/stanford/nlp/process/AbstractTokenizer.java
@@ -1,11 +1,12 @@
package edu.stanford.nlp.process; package edu.stanford.nlp.process;
import edu.stanford.nlp.util.logging.Redwood;



import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;


// import edu.stanford.nlp.util.logging.Redwood;


/** /**
* An abstract tokenizer. Tokenizers extending AbstractTokenizer need only * An abstract tokenizer. Tokenizers extending AbstractTokenizer need only
* implement the {@code getNext()} method. This implementation does not * implement the {@code getNext()} method. This implementation does not
Expand All @@ -18,8 +19,8 @@


public abstract class AbstractTokenizer<T> implements Tokenizer<T> { public abstract class AbstractTokenizer<T> implements Tokenizer<T> {


/** A logger for this class */ // /** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(AbstractTokenizer.class); // private static final Redwood.RedwoodChannels log = Redwood.channels(AbstractTokenizer.class);


protected T nextToken; // = null; protected T nextToken; // = null;


Expand Down

0 comments on commit caa75ee

Please sign in to comment.