Commit 525f214
Merge branch 'master' into yuhao
yuhaozhang authored and Stanford NLP committed Sep 17, 2016
1 parent c6bb088 commit 525f214
Showing 115 changed files with 87,447 additions and 65,688 deletions.
6 changes: 5 additions & 1 deletion CONTRIBUTING.md
@@ -8,9 +8,13 @@ However, Stanford CoreNLP is copyright by Stanford. (Technically, by The Board o
 In order for us to continue to be able to dual-license Stanford CoreNLP, we need to make sure that contributions from others do not restrict Stanford from separately licensing the code.
 
 Therefore, we can accept contributions on any of the following terms:
+
 * If your contribution is a bug fix of 6 lines or less of new code, we will accept it on the basis that both you and us regard the contribution as de minimis, and not requiring further hassle.
 * You can declare that the contribution is in the public domain (in your commit message or pull request).
 * You can make your contribution available under a non-restrictive open source license, such as the Revised (or 3-clause) BSD license, with appropriate licensing information included with the submitted code.
-* You can sign and return to us a contributor license agreement (CLA), explicitly licensing us to be able to use the code. You can find these agreements at http://nlp.stanford.edu/software/CLA/ . You can send them to us or contact us at: java-nlp-support@mailman.stanford.edu .
+* You can sign and return to us a contributor license agreement (CLA), explicitly licensing us to be able to use the code.
+  There is a [Contributor License Agreement for Individuals](http://nlp.stanford.edu/software/CLA/individual.html) and
+  a [Contributor License Agreement for Corporations](http://nlp.stanford.edu/software/CLA/corporate.html).
+  You can send them to us or contact us at: java-nlp-support@lists.stanford.edu .
 
 You should do development against our master branch. The project's source code is in utf-8 character encoding. You should make sure that all unit tests still pass. (In general, you will not be able to run our integration tests, since they rely on resources in our filesystem.)
32 changes: 16 additions & 16 deletions itest/src/edu/stanford/nlp/dcoref/DcorefBenchmarkSlowITest.java
@@ -98,40 +98,40 @@ public void testDcoref() throws Exception {
     expectedResults.setCount(MENTION_F1, 50.42);
     highResults.setCount(MENTION_F1, 50.45);
 
-    lowResults.setCount(MUC_TP, 6250);
-    expectedResults.setCount(MUC_TP, 6253);
-    highResults.setCount(MUC_TP, 6260);
+    lowResults.setCount(MUC_TP, 6245);
+    expectedResults.setCount(MUC_TP, 6250);
+    highResults.setCount(MUC_TP, 6255);
     lowResults.setCount(MUC_F1, 60.65);
-    expectedResults.setCount(MUC_F1, 60.67);
+    expectedResults.setCount(MUC_F1, 60.66);
     highResults.setCount(MUC_F1, 60.7);
 
-    lowResults.setCount(BCUBED_TP, 12450);
-    expectedResults.setCount(BCUBED_TP, 12457.63);
-    highResults.setCount(BCUBED_TP, 12460);
-    lowResults.setCount(BCUBED_F1, 70.8);
-    expectedResults.setCount(BCUBED_F1, 70.81);
+    lowResults.setCount(BCUBED_TP, 12440);
+    expectedResults.setCount(BCUBED_TP, 12445.8);
+    highResults.setCount(BCUBED_TP, 12450);
+    lowResults.setCount(BCUBED_F1, 70.75);
+    expectedResults.setCount(BCUBED_F1, 70.80);
     highResults.setCount(BCUBED_F1, 70.85);
 
-    lowResults.setCount(CEAFM_TP, 10920);
-    expectedResults.setCount(CEAFM_TP, 10927);
+    lowResults.setCount(CEAFM_TP, 10915);
+    expectedResults.setCount(CEAFM_TP, 10920);
     highResults.setCount(CEAFM_TP, 10930);
     lowResults.setCount(CEAFM_F1, 59.4);
-    expectedResults.setCount(CEAFM_F1, 59.44);
+    expectedResults.setCount(CEAFM_F1, 59.42);
     highResults.setCount(CEAFM_F1, 59.5);
 
     lowResults.setCount(CEAFE_TP, 3830);
-    expectedResults.setCount(CEAFE_TP, 3833.81);
+    expectedResults.setCount(CEAFE_TP, 3831.36);
     highResults.setCount(CEAFE_TP, 3840);
     lowResults.setCount(CEAFE_F1, 47.4);
-    expectedResults.setCount(CEAFE_F1, 47.46);
+    expectedResults.setCount(CEAFE_F1, 47.45);
     highResults.setCount(CEAFE_F1, 47.5);
 
     lowResults.setCount(BLANC_F1, 75.35);
-    expectedResults.setCount(BLANC_F1, 75.39);
+    expectedResults.setCount(BLANC_F1, 75.38);
     highResults.setCount(BLANC_F1, 75.42);
 
     lowResults.setCount(CONLL_SCORE, 59.6);
-    expectedResults.setCount(CONLL_SCORE, 59.65);
+    expectedResults.setCount(CONLL_SCORE, 59.64);
     highResults.setCount(CONLL_SCORE, 59.7);
 
     Counter<String> results = new ClassicCounter<String>();
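Note on the numbers above: each benchmark metric is pinned by a low/expected/high triple, and this commit re-centers those brackets, presumably to track the dependency-representation changes made elsewhere in the same commit. A minimal sketch of the bracketing pattern, assuming JUnit and CoreNLP's Counter (the helper class itself is hypothetical, not part of this diff):

    import edu.stanford.nlp.stats.Counter;
    import static org.junit.Assert.assertTrue;

    // Hypothetical helper: a benchmark run passes when each observed score
    // falls inside its configured [low, high] bracket.
    public class BenchmarkBrackets {
      public static void assertBetween(String metric, Counter<String> low,
                                       Counter<String> high, Counter<String> results) {
        double actual = results.getCount(metric);
        assertTrue(metric + " too low: " + actual, actual >= low.getCount(metric));
        assertTrue(metric + " too high: " + actual, actual <= high.getCount(metric));
      }
    }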
@@ -81,7 +81,7 @@ public void testDependencyParserChineseCoNLLX() {
     Properties props = StringUtils.stringToProperties("language=Chinese");
     DependencyParser parser = new DependencyParser(props);
     parser.loadModelFile("/u/nlp/data/depparser/nn/distrib-2014-10-26/CTB_CoNLL_params.txt.gz");
-    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/CTB/dev.gold.conll", null);
+    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/CTB/ctb5.1/dev.gold.conll", null);
     assertEquals(String.format("Chinese CoNLLX gold tags LAS should be %.2f but was %.2f",
                                ChineseConllxGoldTagsLas, las), ChineseConllxGoldTagsLas, las, 1e-4);
   }
@@ -0,0 +1,44 @@
+package edu.stanford.nlp.pipeline;
+
+import junit.framework.TestCase;
+
+import java.util.List;
+import java.util.Properties;
+
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
+
+public class ArabicSegmenterAnnotatorITest extends TestCase {
+  StanfordCoreNLP pipeline = null;
+
+  @Override
+  public void setUp()
+    throws Exception
+  {
+    if (pipeline != null) {
+      return;
+    }
+    Properties props = new Properties();
+    props.setProperty("annotators", "segment");
+    props.setProperty("customAnnotatorClass.segment", "edu.stanford.nlp.pipeline.ArabicSegmenterAnnotator");
+    props.setProperty("segment.model", "/u/nlp/data/arabic-segmenter/arabic-segmenter-atb+bn+arztrain.ser.gz");
+    pipeline = new StanfordCoreNLP(props);
+  }
+
+  public void testPipeline() {
+    String query = "وما هي كلمتُك المفضلة للدراسة؟";
+    String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "ل", "الدراسة", "?"};
+    int[] expectedStartPositions = {0, 1, 4, 7, 12, 14, 22, 23, 29};
+    int[] expectedEndPositions = {1, 3, 6, 11, 13, 21, 23, 29, 30};
+    Annotation annotation = new Annotation(query);
+    pipeline.annotate(annotation);
+
+    List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
+    assertEquals(expectedWords.length, tokens.size());
+    for (int i = 0; i < expectedWords.length; ++i) {
+      assertEquals(expectedWords[i], tokens.get(i).word());
+      assertEquals(expectedStartPositions[i], tokens.get(i).beginPosition());
+      assertEquals(expectedEndPositions[i], tokens.get(i).endPosition());
+    }
+  }
+}
@@ -4,7 +4,7 @@
 
 import java.util.List;
 
-import edu.stanford.nlp.ling.ChineseCoreAnnotations.CharactersAnnotation;
+import edu.stanford.nlp.ling.SegmenterCoreAnnotations.CharactersAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.ChineseCharAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.ChineseSegAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
@@ -95,6 +95,8 @@ public static void sameAsRead(Annotation doc, Annotation readDoc) {
       CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
       for (int k = 0; k < sentence.get(CoreAnnotations.TokensAnnotation.class).size(); ++k) {
         CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(k);
+        token.remove(TreeCoreAnnotations.HeadWordLabelAnnotation.class);
+        token.remove(TreeCoreAnnotations.HeadTagLabelAnnotation.class);
         // Set docID
         if (doc.containsKey(CoreAnnotations.DocIDAnnotation.class)) { token.setDocID(doc.get(CoreAnnotations.DocIDAnnotation.class)); }
         // Set sentence index if not already there
@@ -222,12 +224,14 @@ private static String[] possibleAnnotators() {
     return annotators.toArray(new String[annotators.size()]);
   }
 
-  private void testAnnotators(String annotators) {
+
+  private void testAnnotators(String annotators, Pair<String,String> additionalProperty) {
     try {
       AnnotationSerializer serializer = new ProtobufAnnotationSerializer();
       // Write
       Annotation doc = new StanfordCoreNLP(new Properties(){{
         setProperty("annotators", annotators);
+        setProperty(additionalProperty.first, additionalProperty.second);
       }}).process(THOROUGH_TEST ? prideAndPrejudiceChapters1 : prideAndPrejudiceFirstBit);
       ByteArrayOutputStream ks = new ByteArrayOutputStream();
       serializer.write(doc, ks).close();
@@ -243,6 +247,10 @@ private void testAnnotators(String annotators) {
     } catch (Exception e) { throw new RuntimeException(e); }
   }
 
+  private void testAnnotators(String annotators) {
+    testAnnotators(annotators, Pair.makePair("__none__", "__none__"));
+  }
+
   /*
   TODO(gabor) serialize the entity mentions
   @Test
@@ -428,6 +436,13 @@ public void testGender() {
     testAnnotators("tokenize,ssplit,pos,lemma,ner,gender");
   }
 
+
+  @Test
+  public void testShiftReduce() {
+    testAnnotators("tokenize,ssplit,pos,parse",
+        Pair.makePair("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz"));
+  }
+
   /**
    * Is the protobuf annotator "CoreNLP complete?"
    * That is, does it effectively save every combination of annotators possible?
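The new testShiftReduce case above runs the serializer round-trip with the shift-reduce constituency parser swapped in through the parse.model property. Outside the test harness, the same configuration would look roughly like this (a sketch grounded in the properties the test sets; the input sentence is made up):

    import java.util.Properties;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;

    // Sketch: a pipeline using the shift-reduce parser model, mirroring
    // the properties set in testShiftReduce above.
    public class ShiftReduceExample {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,parse");
        props.setProperty("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("The quick brown fox jumped over the lazy dog.");
        pipeline.annotate(doc);
      }
    }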
24 changes: 17 additions & 7 deletions scripts/nndep/Makefile
@@ -11,9 +11,13 @@ PTB_CONLL_TRAIN=${DATA_DIR}/PTB/CoNLL/train.conll
 PTB_CONLL_DEV=${DATA_DIR}/PTB/CoNLL/dev.conll
 PTB_CONLL_TEST=${DATA_DIR}/PTB/CoNLL/test.conll
 
-CTB_CONLL_TRAIN=${DATA_DIR}/CTB/train.gold.conll
-CTB_CONLL_DEV=${DATA_DIR}/CTB/dev.gold.conll
-CTB_CONLL_TEST=${DATA_DIR}/CTB/test.gold.conll
+CTB5_CONLL_TRAIN=${DATA_DIR}/CTB/ctb5.1/train.gold.conll
+CTB5_CONLL_DEV=${DATA_DIR}/CTB/ctb5.1/dev.gold.conll
+CTB5_CONLL_TEST=${DATA_DIR}/CTB/ctb5.1/test.gold.conll
+
+CTB9_CONLL_TRAIN=${DATA_DIR}/CTB/ctb9.0/chtb.train.conll
+CTB9_CONLL_DEV=${DATA_DIR}/CTB/ctb9.0/chtb.dev.conll
+CTB9_CONLL_TEST=${DATA_DIR}/CTB/ctb9.0/chtb.test.conll
 
 UD_GERMAN_TRAIN=${DATA_DIR}/UD/1.1/de/de-ud-train-clean.conllu
 UD_GERMAN_DEV=${DATA_DIR}/UD/1.1/de/de-ud-dev-clean.conllu
@@ -37,10 +41,16 @@ PTB_CoNLL:
 	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(PTB_CONLL_DEV) -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
 	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(PTB_CONLL_TEST) -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
 
-CTB_CoNLL:
-	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(CTB_CONLL_TRAIN) -language Chinese -devFile $(CTB_CONLL_DEV) -embedFile $(CHINESE_EMBEDDINGS) -model $@.txt.gz >> $@.log 2>&1
-	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB_CONLL_DEV) -language Chinese -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
-	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB_CONLL_TEST) -language Chinese -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
+CTB5_CoNLL:
+	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(CTB5_CONLL_TRAIN) -language Chinese -devFile $(CTB5_CONLL_DEV) -embedFile $(CHINESE_EMBEDDINGS) -model $@.txt.gz >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB5_CONLL_DEV) -language Chinese -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB5_CONLL_TEST) -language Chinese -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
+
+CTB9_CoNLL:
+	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(CTB9_CONLL_TRAIN) -language Chinese -devFile $(CTB9_CONLL_DEV) -embedFile $(CHINESE_EMBEDDINGS) -model $@.txt.gz >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB9_CONLL_DEV) -language Chinese -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB9_CONLL_TEST) -language Chinese -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
+
 
 UD_GERMAN:
 	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(UD_GERMAN_TRAIN) -language German -devFile $(UD_GERMAN_DEV) -model $@.txt.gz >> $@.log 2>&1
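The targets above drive edu.stanford.nlp.parser.nndep.DependencyParser from the command line; the same evaluate step is available programmatically, as the integration test earlier in this diff shows. A rough in-code equivalent of the CTB5_CoNLL dev-set step, assuming placeholder paths for the model and treebank:

    import java.util.Properties;
    import edu.stanford.nlp.parser.nndep.DependencyParser;
    import edu.stanford.nlp.util.StringUtils;

    // Sketch of the CTB5_CoNLL dev-set evaluation done in code. The model
    // and treebank paths are placeholders, not real locations.
    public class Ctb5DevEval {
      public static void main(String[] args) {
        Properties props = StringUtils.stringToProperties("language=Chinese");
        DependencyParser parser = new DependencyParser(props);
        parser.loadModelFile("CTB5_CoNLL.txt.gz");
        double las = parser.testCoNLL("path/to/CTB/ctb5.1/dev.gold.conll", null);
        System.out.printf("CTB 5.1 dev LAS: %.2f%n", las);
      }
    }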
14 changes: 5 additions & 9 deletions src/edu/stanford/nlp/dcoref/CoNLLMentionExtractor.java
@@ -84,7 +84,6 @@ public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics sema
     singletonPredictor = singletonModel;
   }
 
-  private static final boolean includeExtras = false;
   private static final boolean LEMMATIZE = true;
   private static final boolean threadSafe = true;
 
@@ -118,13 +117,10 @@ public Document nextDoc() throws Exception {
       }
       // generate the dependency graph
      try {
-        SemanticGraph deps = SemanticGraphFactory.makeFromTree(tree,
-            SemanticGraphFactory.Mode.COLLAPSED, includeExtras ? GrammaticalStructure.Extras.MAXIMAL : GrammaticalStructure.Extras.NONE, threadSafe, null, true);
-        SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree,
-            SemanticGraphFactory.Mode.BASIC, includeExtras ? GrammaticalStructure.Extras.MAXIMAL : GrammaticalStructure.Extras.NONE, threadSafe, null, true);
+        SemanticGraph deps = SemanticGraphFactory.makeFromTree(tree, SemanticGraphFactory.Mode.ENHANCED, GrammaticalStructure.Extras.NONE, threadSafe);
+        SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree, SemanticGraphFactory.Mode.BASIC, GrammaticalStructure.Extras.NONE, threadSafe);
         sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
-        sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
-        sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, deps);
+        sentence.set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, deps);
       } catch(Exception e) {
         logger.log(Level.WARNING, "Exception caught during extraction of Stanford dependencies. Will ignore and continue...", e);
       }
@@ -267,8 +263,8 @@ public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document
         // will be set by arrange
         mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);
 
-        // Mention dependency is collapsed dependency for sentence
-        mention.dependency = sentences.get(sentIndex).get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        // Mention dependency graph is the enhanced dependency graph of the sentence
+        mention.dependency = sentences.get(sentIndex).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
 
         allGoldMentions.get(sentIndex).add(mention);
       }
6 changes: 4 additions & 2 deletions src/edu/stanford/nlp/dcoref/Dictionaries.java
@@ -181,8 +181,10 @@ public enum Person { I, YOU, HE, SHE, WE, THEY, IT, UNKNOWN}
       "if", "false", "fallacy", "unsuccessfully", "unlikely", "impossible", "improbable", "uncertain", "unsure", "impossibility", "improbability", "cancellation", "breakup", "lack",
       "long-stalled", "end", "rejection", "failure", "avoid", "bar", "block", "break", "cancel", "cease", "cut", "decline", "deny", "deprive", "destroy", "excuse",
       "fail", "forbid", "forestall", "forget", "halt", "lose", "nullify", "prevent", "refrain", "reject", "rebut", "remain", "refuse", "stop", "suspend", "ward"));
-  public final Set<String> neg_relations = Generics.newHashSet(Arrays.asList("prep_without", "prepc_without", "prep_except", "prepc_except", "prep_excluding", "prepx_excluding",
-      "prep_if", "prepc_if", "prep_whether", "prepc_whether", "prep_away_from", "prepc_away_from", "prep_instead_of", "prepc_instead_of"));
+  public final Set<String> neg_relations = Generics.newHashSet(Arrays.asList("nmod:without", "acl:without", "advcl:without",
+      "nmod:except", "acl:except", "advcl:except", "nmod:excluding", "acl:excluding", "advcl:excluding", "nmod:if", "acl:if",
+      "advcl:if", "nmod:whether", "acl:whether", "advcl:whether", "nmod:away_from", "acl:away_from", "advcl:away_from",
+      "nmod:instead_of", "acl:instead_of", "advcl:instead_of"));
   public final Set<String> modals = Generics.newHashSet(Arrays.asList("can", "could", "may", "might", "must", "should", "would", "seem",
       "able", "apparently", "necessarily", "presumably", "probably", "possibly", "reportedly", "supposedly",
       "inconceivable", "chance", "impossibility", "improbability", "encouragement", "improbable", "impossible",
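The replacement relation names follow the Universal Dependencies scheme (nmod:/acl:/advcl: plus the case word) rather than the old collapsed Stanford-dependency prep_/prepc_ names, matching the switch to EnhancedDependenciesAnnotation elsewhere in this commit. A hedged sketch of how a set like neg_relations could be consulted against a SemanticGraph (the surrounding method is hypothetical, not part of this diff):

    import edu.stanford.nlp.dcoref.Dictionaries;
    import edu.stanford.nlp.semgraph.SemanticGraph;
    import edu.stanford.nlp.semgraph.SemanticGraphEdge;

    // Hypothetical check: does any edge of a sentence's dependency graph
    // carry a negation-like relation such as "nmod:without"?
    public class NegRelationCheck {
      public static boolean hasNegRelation(SemanticGraph graph, Dictionaries dict) {
        for (SemanticGraphEdge edge : graph.edgeIterable()) {
          if (dict.neg_relations.contains(edge.getRelation().toString())) {
            return true;
          }
        }
        return false;
      }
    }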
4 changes: 2 additions & 2 deletions src/edu/stanford/nlp/dcoref/Document.java
@@ -705,7 +705,7 @@ private boolean findSpeaker(int utterNum, int sentNum, List<CoreMap> sentences,
       String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
       if(dict.reportVerb.contains(lemma)) {
         // find subject
-        SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
         IndexedWord w = dependency.getNodeByWordPattern(word);
 
         if (w != null) {
@@ -802,7 +802,7 @@ private String findNextParagraphSpeaker(List<CoreMap> paragraph, int paragraphOf
     for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
       if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
         String word = w.get(CoreAnnotations.TextAnnotation.class);
-        SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
         IndexedWord t = dependency.getNodeByWordPattern(word);
 
         for(Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(t)){
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/dcoref/MUCMentionExtractor.java
@@ -272,7 +272,7 @@ else if (w.equals("</COREF>")) {
       List<CoreLabel> unannotatedSent = allWords.get(i);
       List<Mention> mentionInSent = allGoldMentions.get(i);
       for (Mention m : mentionInSent){
-        m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
       }
       if(annotatedSent.size() != unannotatedSent.size()){
         throw new IllegalStateException("annotatedSent != unannotatedSent");
