
Commit

Merge branch 'master' of jamie:/u/nlp/git/javanlp
Arun Tejasvi Chaganty authored and Stanford NLP committed Jul 26, 2016
1 parent 7b7509a commit 7b26f83
Showing 13 changed files with 229 additions and 137 deletions.
24 changes: 12 additions & 12 deletions itest/src/edu/stanford/nlp/pipeline/PipelineITest.java
@@ -2,7 +2,7 @@


import java.util.List;

-import junit.framework.Assert;
+import org.junit.Assert;
import junit.framework.TestCase;

import edu.stanford.nlp.ling.CoreAnnotations;
@@ -33,25 +33,25 @@ public void testPipeline() throws Exception {
    Annotation document = new Annotation(text);
    Assert.assertEquals(text, document.toString());
    Assert.assertEquals(text, document.get(CoreAnnotations.TextAnnotation.class));

    // annotate text with pipeline
    pipeline.annotate(document);

    // demonstrate typical usage
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {

      // get the tree for the sentence
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);

      // get the tokens for the sentence and iterate over them
      for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {

        // get token attributes
        String tokenText = token.get(CoreAnnotations.TextAnnotation.class);
        String tokenPOS = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String tokenLemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        String tokenNE = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

        // text, pos, lemma and name entity tag should be defined
        Assert.assertNotNull(tokenText);
        Assert.assertNotNull(tokenPOS);
@@ -83,7 +83,7 @@ public void testPipeline() throws Exception {
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    Assert.assertNotNull(sentences);
    Assert.assertEquals(2, sentences.size());

    // sentence 1
    String text1 = "Dan Ramage is working for\nMicrosoft.";
    CoreMap sentence1 = sentences.get(0);
@@ -93,12 +93,12 @@ public void testPipeline() throws Exception {
    Assert.assertEquals(36, (int)sentence1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals(0, (int)sentence1.get(CoreAnnotations.TokenBeginAnnotation.class));
    Assert.assertEquals(7, (int)sentence1.get(CoreAnnotations.TokenEndAnnotation.class));

    // sentence 1 tree
    Tree tree1 = Tree.valueOf("(ROOT (S (NP (NNP Dan) (NNP Ramage)) (VP (VBZ is) " +
        "(VP (VBG working) (PP (IN for) (NP (NNP Microsoft))))) (. .)))");
    Assert.assertEquals(tree1, sentence1.get(TreeCoreAnnotations.TreeAnnotation.class));

    // sentence 1 tokens
    String tokenText1 = "Dan Ramage is working for Microsoft .";
    List<CoreLabel> tokens1 = sentence1.get(CoreAnnotations.TokensAnnotation.class);
@@ -112,7 +112,7 @@ public void testPipeline() throws Exception {
Assert.assertEquals("work", tokens1.get(3).get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertEquals("work", tokens1.get(3).get(CoreAnnotations.LemmaAnnotation.class));
Assert.assertEquals(".", tokens1.get(6).get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertEquals(".", tokens1.get(6).get(CoreAnnotations.LemmaAnnotation.class));
Assert.assertEquals("ORGANIZATION", tokens1.get(5).get(CoreAnnotations.NamedEntityTagAnnotation.class)); Assert.assertEquals("ORGANIZATION", tokens1.get(5).get(CoreAnnotations.NamedEntityTagAnnotation.class));

// sentence 2 // sentence 2
String text2 = "He's in Seattle!"; String text2 = "He's in Seattle!";
CoreMap sentence2 = sentences.get(1); CoreMap sentence2 = sentences.get(1);
@@ -127,7 +127,7 @@ public void testPipeline() throws Exception {
    Tree tree2 = Tree.valueOf("(ROOT (S (NP (PRP He)) (VP (VBZ 's) (PP (IN in) " +
        "(NP (NNP Seattle)))) (. !)))");
    Assert.assertEquals(tree2, sentence2.get(TreeCoreAnnotations.TreeAnnotation.class));

    // sentence 2 tokens
    String tokenText2 = "He 's in Seattle !";
    List<CoreLabel> tokens2 = sentence2.get(CoreAnnotations.TokensAnnotation.class);
@@ -140,7 +140,7 @@ public void testPipeline() throws Exception {
Assert.assertEquals("be", tokens2.get(1).get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertEquals("be", tokens2.get(1).get(CoreAnnotations.LemmaAnnotation.class));
Assert.assertEquals("LOCATION", tokens2.get(3).get(CoreAnnotations.NamedEntityTagAnnotation.class)); Assert.assertEquals("LOCATION", tokens2.get(3).get(CoreAnnotations.NamedEntityTagAnnotation.class));
} }

private static String join(List<CoreLabel> tokens) { private static String join(List<CoreLabel> tokens) {
return StringUtils.join(Iterables.transform(tokens, new Function<CoreLabel, String>() { return StringUtils.join(Iterables.transform(tokens, new Function<CoreLabel, String>() {
public String apply(CoreLabel token) { public String apply(CoreLabel token) {
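The hunks above only show the assertion and iteration code; the pipeline construction happens in a collapsed part of the file. For orientation, here is a minimal sketch (not copied from the test) of how such a pipeline is typically built; the annotator list and the class name PipelineSketch are assumptions, chosen to cover the tokens, POS tags, lemmas, NER tags, and parse trees the test checks.

import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineSketch {
  public static void main(String[] args) {
    // Annotators assumed to cover what the test asserts on:
    // tokens, sentence splits, POS tags, lemmas, NER tags, and constituency trees.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Dan Ramage is working for\nMicrosoft. He's in Seattle!");
    pipeline.annotate(document);
  }
}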
24 changes: 10 additions & 14 deletions src/edu/stanford/nlp/ling/tokensregex/demo/TokensRegexDemo.java
@@ -9,44 +9,41 @@
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.PropertiesUtils;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

/**
- * Demo illustrating how to use CoreMapExtractor
+ * Demo illustrating how to use CoreMapExtractor.
 */
public class TokensRegexDemo {


  public static void main(String[] args) throws IOException {
-    PrintWriter out;

    String rules;
    if (args.length > 0) {
      rules = args[0];
    } else {
      rules = "edu/stanford/nlp/ling/tokensregex/demo/rules/expr.rules.txt";
    }
+    PrintWriter out;
    if (args.length > 2) {
      out = new PrintWriter(args[2]);
    } else {
      out = new PrintWriter(System.out);
    }


-    CoreMapExpressionExtractor extractor = CoreMapExpressionExtractor
-        .createExtractorFromFiles(
-            TokenSequencePattern.getNewEnv(),
-            rules);
+    CoreMapExpressionExtractor<MatchedExpression> extractor = CoreMapExpressionExtractor
+        .createExtractorFromFiles(TokenSequencePattern.getNewEnv(), rules);

-    StanfordCoreNLP pipeline = new StanfordCoreNLP();
+    StanfordCoreNLP pipeline = new StanfordCoreNLP(
+        PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation;
    if (args.length > 1) {
      annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[1]));
    } else {
-      // annotation = new Annotation("I know Fred has acne. And Wilma has breast cancer.");
      annotation = new Annotation("( ( five plus three plus four ) * 2 ) divided by three");
    }


    pipeline.annotate(annotation);
@@ -60,19 +57,18 @@ public static void main(String[] args) throws IOException {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
-      List<MatchedExpression> matchedExpressions = extractor
-          .extractExpressions(sentence);
+      List<MatchedExpression> matchedExpressions = extractor.extractExpressions(sentence);
      for (MatchedExpression matched:matchedExpressions) {
        // Print out matched text and value
-        out.println("matched: " + matched.getText() + " with value " + matched.getValue());
+        out.println("Matched expression: " + matched.getText() + " with value " + matched.getValue());
        // Print out token information
        CoreMap cm = matched.getAnnotation();
        for (CoreLabel token : cm.get(CoreAnnotations.TokensAnnotation.class)) {
          String word = token.get(CoreAnnotations.TextAnnotation.class);
          String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
          String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
-          out.println("matched token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne);
+          out.println("  Matched token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne);
        }
      }
    }
src/edu/stanford/nlp/objectbank/IteratorFromReaderFactory.java
@@ -1,15 +1,20 @@
package edu.stanford.nlp.objectbank;

+import java.io.Serializable;
import java.util.Iterator;


/**
 * An IteratorFromReaderFactory is used to convert a java.io.Reader
 * into an Iterator over the Objects of type T represented by the text
 * in the java.io.Reader.
 *
+ * (We have it be Serializable just to avoid non-serializable warnings;
+ * since implementations of this class normally have no state, they
+ * should be trivially serializable.)
+ *
 * @author Jenny Finkel
 */
-public interface IteratorFromReaderFactory<T> {
+public interface IteratorFromReaderFactory<T> extends Serializable {


  /** Return an iterator over the contents read from r.
   *
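The hunk above cuts off before the interface's method declaration. As a rough illustration only (not part of this commit), an implementation whose iterator yields each line of the Reader as a String could look like the sketch below; the class is hypothetical, and the getIterator signature is an assumption based on the javadoc line shown above.

import java.io.BufferedReader;
import java.io.Reader;
import java.util.Iterator;

import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;

// Hypothetical example, not a class in the repository.
public class LineIteratorFromReaderFactory implements IteratorFromReaderFactory<String> {
  // Return an iterator over the lines read from r.
  public Iterator<String> getIterator(Reader r) {
    return new BufferedReader(r).lines().iterator();
  }
}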
6 changes: 2 additions & 4 deletions src/edu/stanford/nlp/process/TokenizerFactory.java
@@ -5,10 +5,8 @@
import java.io.Reader;


/**
- * A TokenizerFactory is used to convert a java.io.Reader into a Tokenizer
- * (an extension of Iterator) over objects of type T represented by the text
- * in the java.io.Reader. It's mainly a convenience, since you could cast
- * down anyway.
+ * A TokenizerFactory is a factory that can build a Tokenizer (an extension of Iterator)
+ * from a java.io.Reader.
 *
 * <i>IMPORTANT NOTE:</i><br/>
* *
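As context for the reworded javadoc, a minimal usage sketch follows (not from this commit): obtain a concrete factory, build a Tokenizer from a Reader, and iterate over the tokens. PTBTokenizer.coreLabelFactory() and the sample sentence are assumptions here; any TokenizerFactory implementation is used the same way.

import java.io.StringReader;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class TokenizerFactorySketch {
  public static void main(String[] args) {
    // Build a Tokenizer over CoreLabels from a Reader and print each token's text.
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.coreLabelFactory();
    Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(new StringReader("Dan Ramage is working for Microsoft."));
    while (tokenizer.hasNext()) {
      System.out.println(tokenizer.next().word());
    }
  }
}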
@@ -38,7 +38,7 @@ public class CTBErrorCorrectingTreeNormalizer extends BobChrisTreeNormalizer {
  private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*");
  private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*");


-  private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer", "true") != null;
+  private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer") != null;


  @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
  private final TreeTransformer tagExtender;
@@ -203,7 +203,9 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    } else if (kids.length > 0) { // ROOT has 1 child - the normal case
      Tree child = kids[0];
      if ( ! child.isPhrasal()) {
-        EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
+        if (DEBUG) {
+          EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
+        }
        Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids));
        newTree.setChild(0, added);
      } else if (child.label().value().equals("META")) {
@@ -310,6 +312,13 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
      }
    }


+    // at least once we just end up deleting everything under ROOT. In which case, we should just get rid of the tree.
+    if (newTree.numChildren() == 0) {
+      if (DEBUG) {
+        EncodingPrintWriter.err.println("Deleting tree that now has no contents: " + newTree, ChineseTreebankLanguagePack.ENCODING);
+      }
+      return null;
+    }


    if (tagExtender != null) {
      newTree = tagExtender.transformTree(newTree);
@@ -93,6 +93,15 @@ public ChineseHeadFinder(TreebankLanguagePack tlp) {


    // new for ctb6.
    nonTerminalInfo.put("FLR", new String[][]{rightExceptPunct});

+    // new for CTB9
+    nonTerminalInfo.put("DFL", new String[][]{rightExceptPunct});
+    nonTerminalInfo.put("EMO", new String[][]{leftExceptPunct}); // left/right doesn't matter
+    nonTerminalInfo.put("INC", new String[][]{leftExceptPunct});
+    nonTerminalInfo.put("INTJ", new String[][]{leftExceptPunct});
+    nonTerminalInfo.put("OTH", new String[][]{leftExceptPunct});
+    nonTerminalInfo.put("SKIP", new String[][]{leftExceptPunct});

  }


  private static final long serialVersionUID = 6143632784691159283L;
