Commit 931c483

Merge branch 'master' into gm-quote

Grace Muzny authored and Stanford NLP committed Feb 16, 2015
1 parent 8820df2 commit 931c483

Showing 8 changed files with 161 additions and 128 deletions.
5 changes: 2 additions & 3 deletions JavaNLP-core.iml
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<module classpath="eclipse" classpath-dir="$MODULE_DIR$" type="JAVA_MODULE" version="4">
<component name="EclipseModuleManager">
<libelement value="jar://$MODULE_DIR$/lib/ant-contrib-1.0b3.jar!/" />
<libelement value="jar://$MODULE_DIR$/lib/tomcat/el-api.jar!/" />
@@ -311,5 +311,4 @@
</library>
</orderEntry>
</component>
</module>

</module>
52 changes: 29 additions & 23 deletions itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
@@ -36,30 +36,30 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
CoreMap sent = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
assertTrue(sent.get(CoreAnnotations.TokensAnnotation.class) != null);
List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
if(VERBOSE){
if (VERBOSE) {
for(CoreLabel token: tokens) {
System.out.println("\t" + token.word() + " " +
token.tag() + " " +
token.ner() + " " +
System.out.println('\t' + token.word() + ' ' +
token.tag() + ' ' +
token.ner() + ' ' +
(token.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class) ? token.get(CoreAnnotations.NumericCompositeValueAnnotation.class) + " " : "") +
(token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
(token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
}
}

// check NER labels
assertTrue(tokens.size() == labels.length);
for(int i = 0; i < labels.length; i ++){
for (int i = 0; i < labels.length; i ++) {
if(labels[i] == null){
assertTrue(tokens.get(i).ner() == null);
} else {
Pattern p = Pattern.compile(labels[i]);
System.err.println("COMPARING NER " + labels[i] + " with " + tokens.get(i).ner());
System.err.flush();
assertTrue(tokens.get(i).ner() != null);
assertTrue("NER should not be null for token " + tokens.get(i) + " in sentence " + tokens, tokens.get(i).ner() != null);
assertTrue(tokens.get(i).ner() + " does not match " + p + " for token " + tokens.get(i) + " in sentence " + tokens, p.matcher(tokens.get(i).ner()).matches());
}
}

// check normalized values, if gold is given
if(normed != null){
assertTrue(tokens.size() == normed.length);
@@ -70,8 +70,8 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
Pattern p = Pattern.compile(normed[i]);
String n = tokens.get(i).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
String message = "COMPARING NORMED \"" + normed[i] + "\" with \"" + n + "\"";
assertTrue(message, n != null);
assertTrue(message, p.matcher(n).matches());
assertTrue(message + "; latter should not be null", n != null);
assertTrue(message + "; latter should match", p.matcher(n).matches());
}
}
}
@@ -83,8 +83,8 @@ private static void run(String header, String [] texts, String [][] answers, Str
if(VERBOSE) {
System.out.println("Running test " + header + " for text: " + texts[i]);
}
checkLabels(pipe,
texts[i],
checkLabels(pipe,
texts[i],
answers[i],
normed != null ? normed[i] : null);
}
@@ -100,19 +100,21 @@ private static void run(String header, String [] texts, String [][] answers, Str
"It cost four million dollars",
"It cost $1m",
"It cost 50 cents",
"It cost # 1500",
"It cost £ 1500",
"It cost \u00A3 1500",
"It cost \u00A3 .50",
"It cost # .50",
"It cost .50",
"It cost $ 1500",
"It cost $1500",
"It cost $ 1,500",
"It cost $1,500",
"It cost $48.75",
"It cost $ 57.60",
"It cost $8 thousand",
"It cost $42,33"
"It cost $42,33",
// "It cost ₩1500", // TODO: Add won symbol to PTBTokenizer
};

private static final String [][] moneyAnswers = {
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" },
@@ -134,8 +136,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" }
{ null, null, "MONEY", "MONEY" },
// { null, null, "MONEY", "MONEY" },
};

private static final String [][] moneyNormed = {
{ null, null, "\\$5.0", "\\$5.0" },
{ null, null, "\\$0.24", "\\$0.24" },
@@ -149,16 +153,18 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "\u00A31500.0", "\u00A31500.0" },
{ null, null, "\u00A31500.0", "\u00A31500.0" },
{ null, null, "\u00A30.5", "\u00A30.5" },
{ null, null, "\u00A30.5", "\u00A30.5" },
{ null, null, "\\$0.5", "\\$0.5" }, // TODO: Fix PTBTokenizer to really normalize it to Euro €
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$48.75", "\\$48.75" },
{ null, null, "\\$57.6", "\\$57.6" },
{ null, null, "\\$8000.0", "\\$8000.0", "\\$8000.0" },
{ null, null, "\\$4233.0", "\\$4233.0" }
{ null, null, "\\$4233.0", "\\$4233.0" },
// { null, null, "₩4233.0", "₩4233.0" },
};

public void testMoney() {
run("MONEY", moneyStrings, moneyAnswers, moneyNormed);
}
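The MONEY cases above check two token-level annotations: the coarse NER label and the normalized value. As an illustrative sketch (not part of this commit, reusing the pipeline properties and annotation keys already used in this test file), the normalized values can be read back like this:

Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
StanfordCoreNLP pipe = new StanfordCoreNLP(props);
Annotation doc = new Annotation("It cost $8 thousand");
pipe.annotate(doc);
for (CoreLabel token : doc.get(CoreAnnotations.TokensAnnotation.class)) {
  // per moneyNormed above, "$", "8", and "thousand" should all normalize to $8000.0
  System.out.println(token.word() + " " + token.ner() + " "
      + token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
}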
@@ -185,7 +191,7 @@ public void testMoney() {
{ null, null, null, "1000.0", null },
};
public void testOrdinal() {
run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
}

private static final String [] dateStrings = {
@@ -243,7 +249,7 @@ public void testOrdinal() {
{ "2008-06-06" , "2008-06-06", "2008-06-06", null, "2008-06-07" , "2008-06-07", "2008-06-07" },
};
public void testDate() {
run("DATE", dateStrings, dateAnswers, dateNormed);
run("DATE", dateStrings, dateAnswers, dateNormed);
}

private static final String [] numberStrings = {
@@ -280,9 +286,9 @@ public void testDate() {
{ "801.0", null, "123.0", null }
};
public void testNumber() {
run("NUMBER", numberStrings, numberAnswers, numberNormed);
run("NUMBER", numberStrings, numberAnswers, numberNormed);
}

private static final String [] timeStrings = {
"the time was 10:20",
"12:29 p.m.",
@@ -13,7 +13,7 @@
import java.util.Properties;


/**
/**
* @author Angel Chang
* @author John Bauer
*/
@@ -23,9 +23,9 @@ public class NERCombinerAnnotatorITest extends TestCase {
public static final String NER_7CLASS = DefaultPaths.DEFAULT_NER_MUC_MODEL;
public static final String NER_MISCCLASS = DefaultPaths.DEFAULT_NER_CONLL_MODEL;

static NERCombinerAnnotator nerAnnotator = null;
static AnnotationPipeline unthreadedPipeline = null;
static AnnotationPipeline threaded4Pipeline = null;
private static NERCombinerAnnotator nerAnnotator = null;
private static AnnotationPipeline unthreadedPipeline = null;
private static AnnotationPipeline threaded4Pipeline = null;

/**
* Creates the tagger annotator if it isn't already created
@@ -42,7 +42,7 @@ public void setUp()
props.setProperty("ner.applyNumericClassifiers", "false");
props.setProperty("ner.useSUTime", "false");
props.setProperty("ner.model", NER_3CLASS);
NERClassifierCombiner ner = NERCombinerAnnotator.createNERClassifierCombiner("ner", props);
NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
NERCombinerAnnotator threaded4Annotator = new NERCombinerAnnotator(ner, false, 4, -1);

threaded4Pipeline = new AnnotationPipeline();
@@ -75,7 +75,7 @@ public void testThreadedAnnotator() {
verifyAnswers(ANSWERS, document);
}

public void verifyAnswers(String[][] expected, Annotation document) {
public static void verifyAnswers(String[][] expected, Annotation document) {
int sentenceIndex = 0;
for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
@@ -100,7 +100,7 @@ public void verifyAnswers(String[][] expected, Annotation document) {

private static Iterator<Annotation> getTestData(String inputString, boolean includeAnswer)
{
ColumnTabDocumentReaderWriter colReader = new ColumnTabDocumentReaderWriter();
ColumnTabDocumentReaderWriter<CoreMap> colReader = new ColumnTabDocumentReaderWriter<>();
if (includeAnswer) {
colReader.init("word=0,tag=1,answer=2");
} else {
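The fix above moves the factory call from NERCombinerAnnotator to NERClassifierCombiner, where createNERClassifierCombiner is declared. A minimal sketch of the corrected usage, reusing the properties and constants from this test's setUp():

Properties props = new Properties();
props.setProperty("ner.applyNumericClassifiers", "false");
props.setProperty("ner.useSUTime", "false");
props.setProperty("ner.model", NER_3CLASS);
// the static factory lives on NERClassifierCombiner, not on the annotator
NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
// same constructor arguments as setUp() above: a 4-threaded annotator
NERCombinerAnnotator annotator = new NERCombinerAnnotator(ner, false, 4, -1);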
52 changes: 20 additions & 32 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -108,8 +108,7 @@ public DocumentReaderAndWriter<IN> plainTextReaderAndWriter() {
* Construct a SeqClassifierFlags object based on the passed in properties,
* and then call the other constructor.
*
* @param props
* See SeqClassifierFlags for known properties.
* @param props See SeqClassifierFlags for known properties.
*/
public AbstractSequenceClassifier(Properties props) {
this(new SeqClassifierFlags(props));
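An illustrative use of this constructor, not part of this diff: CRFClassifier is one concrete subclass, "trainFile" and "map" are standard SeqClassifierFlags properties, and the file path is hypothetical.

Properties props = new Properties();
props.setProperty("trainFile", "/path/to/train.tsv"); // hypothetical training file
props.setProperty("map", "word=0,answer=1");          // column layout for the reader
CRFClassifier<CoreLabel> crf = new CRFClassifier<>(props);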
@@ -124,14 +123,13 @@ public AbstractSequenceClassifier(Properties props) {
public AbstractSequenceClassifier(SeqClassifierFlags flags) {
this.flags = flags;

// try {
// Thang Sep13: allow for multiple feature factories.
this.featureFactories = Generics.newArrayList();
if (flags.featureFactory != null) {
FeatureFactory<IN> factory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
featureFactories.add(factory);
}
if(flags.featureFactories!=null){
if (flags.featureFactories != null) {
for (int i = 0; i < flags.featureFactories.length; i++) {
FeatureFactory<IN> indFeatureFactory = new MetaClass(flags.featureFactories[i]).
createInstance(flags.featureFactoriesArgs.get(i));
@@ -142,11 +140,7 @@ public AbstractSequenceClassifier(SeqClassifierFlags flags) {
tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
} else {
this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs);
// this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(flags.tokenFactory).newInstance();
}
// } catch (Exception e) {
// throw new RuntimeException(e);
// }
pad = tokenFactory.makeToken();
windowSize = flags.maxLeft + 1;
reinit();
@@ -281,7 +275,7 @@ public List<IN> classifySentence(List<? extends HasWord> sentence) {
i++;
}

// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<IN>(flags, null, knownLCWords);
wrapper.processDocument(document);

@@ -320,7 +314,7 @@ public List<IN> classifySentenceWithGlobalInformation(List<? extends HasWord> to
i++;
}

// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<IN>(flags, null, knownLCWords);
wrapper.processDocument(document);

@@ -402,7 +396,7 @@ public DFSA<String, Integer> getViterbiSearchGraph(List<IN> doc, Class<? extends
if (doc.isEmpty()) {
return new DFSA<String, Integer>(null);
}
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
ObjectBankWrapper<IN> obw = new ObjectBankWrapper<IN>(flags, null, knownLCWords);
doc = obw.processDocument(doc);
SequenceModel model = getSequenceModel(doc);
@@ -441,8 +435,7 @@ public List<List<IN>> classify(String str) {
* Classify the tokens in a String. Each sentence becomes a separate document.
* Doesn't override default readerAndWriter.
*
* @param str
* A String with tokens in one or more sentences of text to be
* @param str A String with tokens in one or more sentences of text to be
* classified.
* @return {@link List} of classified sentences (each a List of something that
* extends {@link CoreMap}).
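A hedged sketch of the documented behavior, where classifier stands for any concrete AbstractSequenceClassifier over CoreLabel (for example, a loaded CRFClassifier):

List<List<CoreLabel>> sentences =
    classifier.classify("John lives in Palo Alto. He works at Stanford.");
for (List<CoreLabel> sentence : sentences) {
  for (CoreLabel word : sentence) {
    // each classified token carries its label under AnswerAnnotation
    System.out.print(word.word() + "/" + word.get(CoreAnnotations.AnswerAnnotation.class) + " ");
  }
  System.out.println();
}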
@@ -563,7 +556,7 @@ public String classifyToString(String sentences, String outputFormat, boolean pr
plainTextReaderAndWriter.printAnswers(docOutput, pw);
pw.flush();
sb.append(sw.toString());
sb.append("\n");
sb.append('\n');
}
}
return sb.toString();
@@ -791,8 +784,7 @@ public void train(String[] trainFileList,
* Trains a classifier from a Collection of sequences.
* Note that the Collection can be (and usually is) an ObjectBank.
*
* @param docs
* An Objectbank or a collection of sequences of IN
* @param docs An ObjectBank or a collection of sequences of IN
*/
public void train(Collection<List<IN>> docs) {
train(docs, defaultReaderAndWriter);
@@ -802,10 +794,8 @@ public void train(Collection<List<IN>> docs) {
* Trains a classifier from a Collection of sequences.
* Note that the Collection can be (and usually is) an ObjectBank.
*
* @param docs
* An ObjectBank or a collection of sequences of IN
* @param readerAndWriter
* A DocumentReaderAndWriter to use when loading test files
* @param docs An ObjectBank or a collection of sequences of IN
* @param readerAndWriter A DocumentReaderAndWriter to use when loading test files
*/
public abstract void train(Collection<List<IN>> docs,
DocumentReaderAndWriter<IN> readerAndWriter);
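An illustrative training call combining this method with the ObjectBank helpers below; the file names are hypothetical, and makeReaderAndWriter() is assumed here as the way to obtain the default DocumentReaderAndWriter:

DocumentReaderAndWriter<CoreLabel> rw = classifier.makeReaderAndWriter();
ObjectBank<List<CoreLabel>> docs =
    classifier.makeObjectBankFromFiles(new String[]{"train1.tsv", "train2.tsv"}, rw);
classifier.train(docs, rw);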
@@ -858,9 +848,8 @@ public ObjectBank<List<IN>> makeObjectBankFromFiles(String[] trainFileList,
File f = new File(trainFile);
files.add(f);
}
// System.err.printf("trainFileList contains %d file%s.\n", files.size(),
// files.size() == 1 ? "": "s");
// TODO get rid of objectbankwrapper
// System.err.printf("trainFileList contains %d file%s in encoding %s.%n", files.size(), files.size() == 1 ? "": "s", flags.inputEncoding);
// TODO get rid of ObjectBankWrapper
// return new ObjectBank<List<IN>>(new
// ResettableReaderIteratorFactory(files), readerAndWriter);
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files, flags.inputEncoding),
@@ -892,7 +881,7 @@ public ObjectBank<List<IN>> makeObjectBankFromFiles(String baseDir, String fileP
// return new ObjectBank<List<IN>>(new
// ResettableReaderIteratorFactory(files, flags.inputEncoding),
// readerAndWriter);
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files,
flags.inputEncoding), readerAndWriter), knownLCWords);
}
@@ -905,7 +894,7 @@ public ObjectBank<List<IN>> makeObjectBankFromFiles(Collection<File> files,
// return new ObjectBank<List<IN>>(new
// ResettableReaderIteratorFactory(files, flags.inputEncoding),
// readerAndWriter);
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files,
flags.inputEncoding), readerAndWriter), knownLCWords);
}
@@ -929,7 +918,7 @@ public ObjectBank<List<IN>> makeObjectBankFromReader(BufferedReader in,
if (flags.announceObjectBankEntries) {
System.err.println("Reading data using " + readerAndWriter.getClass());
}
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
// return new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(in),
// readerAndWriter);
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(in),
@@ -956,8 +945,7 @@ public void printProbs(String filename,
* Takes a {@link List} of documents and prints the likelihood of each
* possible label at each point.
*
* @param documents
* A {@link List} of {@link List} of something that extends
* @param documents A {@link List} of {@link List} of something that extends
* {@link CoreMap}.
*/
public void printProbsDocuments(ObjectBank<List<IN>> documents) {
@@ -1076,9 +1064,9 @@ public void classifyAndWriteAnswers(Collection<List<IN>> documents,

Timing timer = new Timing();

Counter<String> entityTP = new ClassicCounter<String>();
Counter<String> entityFP = new ClassicCounter<String>();
Counter<String> entityFN = new ClassicCounter<String>();
Counter<String> entityTP = new ClassicCounter<>();
Counter<String> entityFP = new ClassicCounter<>();
Counter<String> entityFN = new ClassicCounter<>();
boolean resultsCounted = outputScores;
int numWords = 0;
int numDocs = 0;
@@ -1150,7 +1138,7 @@ public ThreadsafeProcessor<List<IN>, List<IN>> newInstance() {
*
* @param testFile The name of the file to test on.
* @param k How many best to print
* @param readerAndWriter
* @param readerAndWriter Class to be used for printing answers
*/
public void classifyAndWriteAnswersKBest(String testFile, int k,
DocumentReaderAndWriter<IN> readerAndWriter)