Commit 525f214
Merge branch 'master' into yuhao
yuhaozhang authored and Stanford NLP committed Sep 17, 2016
1 parent c6bb088 commit 525f214
Showing 115 changed files with 87,447 additions and 65,688 deletions.
6 changes: 5 additions & 1 deletion CONTRIBUTING.md
@@ -8,9 +8,13 @@ However, Stanford CoreNLP is copyright by Stanford. (Technically, by The Board o
 In order for us to continue to be able to dual-license Stanford CoreNLP, we need to make sure that contributions from others do not restrict Stanford from separately licensing the code.
 
 Therefore, we can accept contributions on any of the following terms:
+
 * If your contribution is a bug fix of 6 lines or less of new code, we will accept it on the basis that both you and us regard the contribution as de minimis, and not requiring further hassle.
 * You can declare that the contribution is in the public domain (in your commit message or pull request).
 * You can make your contribution available under a non-restrictive open source license, such as the Revised (or 3-clause) BSD license, with appropriate licensing information included with the submitted code.
-* You can sign and return to us a contributor license agreement (CLA), explicitly licensing us to be able to use the code. You can find these agreements at http://nlp.stanford.edu/software/CLA/ . You can send them to us or contact us at: java-nlp-support@mailman.stanford.edu .
+* You can sign and return to us a contributor license agreement (CLA), explicitly licensing us to be able to use the code.
+  There is a [Contributor License Agreement for Individuals](http://nlp.stanford.edu/software/CLA/individual.html) and
+  a [Contributor License Agreement for Corporations](http://nlp.stanford.edu/software/CLA/corporate.html).
+  You can send them to us or contact us at: java-nlp-support@lists.stanford.edu .
 
 You should do development against our master branch. The project's source code is in utf-8 character encoding. You should make sure that all unit tests still pass. (In general, you will not be able to run our integration tests, since they rely on resources in our filesystem.)
32 changes: 16 additions & 16 deletions itest/src/edu/stanford/nlp/dcoref/DcorefBenchmarkSlowITest.java
@@ -98,40 +98,40 @@ public void testDcoref() throws Exception {
     expectedResults.setCount(MENTION_F1, 50.42);
     highResults.setCount(MENTION_F1, 50.45);
 
-    lowResults.setCount(MUC_TP, 6250);
-    expectedResults.setCount(MUC_TP, 6253);
-    highResults.setCount(MUC_TP, 6260);
+    lowResults.setCount(MUC_TP, 6245);
+    expectedResults.setCount(MUC_TP, 6250);
+    highResults.setCount(MUC_TP, 6255);
     lowResults.setCount(MUC_F1, 60.65);
-    expectedResults.setCount(MUC_F1, 60.67);
+    expectedResults.setCount(MUC_F1, 60.66);
     highResults.setCount(MUC_F1, 60.7);
 
-    lowResults.setCount(BCUBED_TP, 12450);
-    expectedResults.setCount(BCUBED_TP, 12457.63);
-    highResults.setCount(BCUBED_TP, 12460);
-    lowResults.setCount(BCUBED_F1, 70.8);
-    expectedResults.setCount(BCUBED_F1, 70.81);
+    lowResults.setCount(BCUBED_TP, 12440);
+    expectedResults.setCount(BCUBED_TP, 12445.8);
+    highResults.setCount(BCUBED_TP, 12450);
+    lowResults.setCount(BCUBED_F1, 70.75);
+    expectedResults.setCount(BCUBED_F1, 70.80);
     highResults.setCount(BCUBED_F1, 70.85);
 
-    lowResults.setCount(CEAFM_TP, 10920);
-    expectedResults.setCount(CEAFM_TP, 10927);
+    lowResults.setCount(CEAFM_TP, 10915);
+    expectedResults.setCount(CEAFM_TP, 10920);
     highResults.setCount(CEAFM_TP, 10930);
     lowResults.setCount(CEAFM_F1, 59.4);
-    expectedResults.setCount(CEAFM_F1, 59.44);
+    expectedResults.setCount(CEAFM_F1, 59.42);
     highResults.setCount(CEAFM_F1, 59.5);
 
     lowResults.setCount(CEAFE_TP, 3830);
-    expectedResults.setCount(CEAFE_TP, 3833.81);
+    expectedResults.setCount(CEAFE_TP, 3831.36);
     highResults.setCount(CEAFE_TP, 3840);
     lowResults.setCount(CEAFE_F1, 47.4);
-    expectedResults.setCount(CEAFE_F1, 47.46);
+    expectedResults.setCount(CEAFE_F1, 47.45);
     highResults.setCount(CEAFE_F1, 47.5);
 
     lowResults.setCount(BLANC_F1, 75.35);
-    expectedResults.setCount(BLANC_F1, 75.39);
+    expectedResults.setCount(BLANC_F1, 75.38);
     highResults.setCount(BLANC_F1, 75.42);
 
     lowResults.setCount(CONLL_SCORE, 59.6);
-    expectedResults.setCount(CONLL_SCORE, 59.65);
+    expectedResults.setCount(CONLL_SCORE, 59.64);
     highResults.setCount(CONLL_SCORE, 59.7);
 
     Counter<String> results = new ClassicCounter<String>();
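Note on the numbers above: each benchmark metric is pinned by a low/expected/high triple, and this commit re-centers those brackets, presumably to track the dependency-representation changes made elsewhere in the same commit. A minimal sketch of the bracketing pattern, assuming JUnit and CoreNLP's Counter (the helper class itself is hypothetical, not part of this diff):

    import edu.stanford.nlp.stats.Counter;
    import static org.junit.Assert.assertTrue;

    // Hypothetical helper: a benchmark run passes when each observed score
    // falls inside its configured [low, high] bracket.
    public class BenchmarkBrackets {
      public static void assertBetween(String metric, Counter<String> low,
                                       Counter<String> high, Counter<String> results) {
        double actual = results.getCount(metric);
        assertTrue(metric + " too low: " + actual, actual >= low.getCount(metric));
        assertTrue(metric + " too high: " + actual, actual <= high.getCount(metric));
      }
    }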
@@ -81,7 +81,7 @@ public void testDependencyParserChineseCoNLLX() {
     Properties props = StringUtils.stringToProperties("language=Chinese");
     DependencyParser parser = new DependencyParser(props);
     parser.loadModelFile("/u/nlp/data/depparser/nn/distrib-2014-10-26/CTB_CoNLL_params.txt.gz");
-    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/CTB/dev.gold.conll", null);
+    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/CTB/ctb5.1/dev.gold.conll", null);
     assertEquals(String.format("Chinese CoNLLX gold tags LAS should be %.2f but was %.2f",
                                ChineseConllxGoldTagsLas, las), ChineseConllxGoldTagsLas, las, 1e-4);
   }
@@ -0,0 +1,44 @@
+package edu.stanford.nlp.pipeline;
+
+import junit.framework.TestCase;
+
+import java.util.List;
+import java.util.Properties;
+
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
+
+public class ArabicSegmenterAnnotatorITest extends TestCase {
+  StanfordCoreNLP pipeline = null;
+
+  @Override
+  public void setUp()
+    throws Exception
+  {
+    if (pipeline != null) {
+      return;
+    }
+    Properties props = new Properties();
+    props.setProperty("annotators", "segment");
+    props.setProperty("customAnnotatorClass.segment", "edu.stanford.nlp.pipeline.ArabicSegmenterAnnotator");
+    props.setProperty("segment.model", "/u/nlp/data/arabic-segmenter/arabic-segmenter-atb+bn+arztrain.ser.gz");
+    pipeline = new StanfordCoreNLP(props);
+  }
+
+  public void testPipeline() {
+    String query = "وما هي كلمتُك المفضلة للدراسة؟";
+    String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "ل", "الدراسة", "?"};
+    int[] expectedStartPositions = {0, 1, 4, 7, 12, 14, 22, 23, 29};
+    int[] expectedEndPositions = {1, 3, 6, 11, 13, 21, 23, 29, 30};
+    Annotation annotation = new Annotation(query);
+    pipeline.annotate(annotation);
+
+    List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
+    assertEquals(expectedWords.length, tokens.size());
+    for (int i = 0; i < expectedWords.length; ++i) {
+      assertEquals(expectedWords[i], tokens.get(i).word());
+      assertEquals(expectedStartPositions[i], tokens.get(i).beginPosition());
+      assertEquals(expectedEndPositions[i], tokens.get(i).endPosition());
+    }
+  }
+}
@@ -4,7 +4,7 @@
 
 import java.util.List;
 
-import edu.stanford.nlp.ling.ChineseCoreAnnotations.CharactersAnnotation;
+import edu.stanford.nlp.ling.SegmenterCoreAnnotations.CharactersAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.ChineseCharAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.ChineseSegAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
@@ -95,6 +95,8 @@ public static void sameAsRead(Annotation doc, Annotation readDoc) {
       CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
       for (int k = 0; k < sentence.get(CoreAnnotations.TokensAnnotation.class).size(); ++k) {
         CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(k);
+        token.remove(TreeCoreAnnotations.HeadWordLabelAnnotation.class);
+        token.remove(TreeCoreAnnotations.HeadTagLabelAnnotation.class);
         // Set docID
         if (doc.containsKey(CoreAnnotations.DocIDAnnotation.class)) { token.setDocID(doc.get(CoreAnnotations.DocIDAnnotation.class)); }
         // Set sentence index if not already there
@@ -222,12 +224,14 @@ private static String[] possibleAnnotators() {
     return annotators.toArray(new String[annotators.size()]);
   }
 
-  private void testAnnotators(String annotators) {
+
+  private void testAnnotators(String annotators, Pair<String,String> additionalProperty) {
     try {
       AnnotationSerializer serializer = new ProtobufAnnotationSerializer();
       // Write
       Annotation doc = new StanfordCoreNLP(new Properties(){{
         setProperty("annotators", annotators);
+        setProperty(additionalProperty.first, additionalProperty.second);
       }}).process(THOROUGH_TEST ? prideAndPrejudiceChapters1 : prideAndPrejudiceFirstBit);
       ByteArrayOutputStream ks = new ByteArrayOutputStream();
       serializer.write(doc, ks).close();
@@ -243,6 +247,10 @@ private void testAnnotators(String annotators) {
     } catch (Exception e) { throw new RuntimeException(e); }
   }
 
+  private void testAnnotators(String annotators) {
+    testAnnotators(annotators, Pair.makePair("__none__", "__none__"));
+  }
+
   /*
   TODO(gabor) serialize the entity mentions
   @Test
@@ -428,6 +436,13 @@ public void testGender() {
     testAnnotators("tokenize,ssplit,pos,lemma,ner,gender");
   }
 
+
+  @Test
+  public void testShiftReduce() {
+    testAnnotators("tokenize,ssplit,pos,parse",
+        Pair.makePair("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz"));
+  }
+
   /**
    * Is the protobuf annotator "CoreNLP complete?"
    * That is, does it effectively save every combination of annotators possible?
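The new testShiftReduce case above runs the serializer round-trip with the shift-reduce constituency parser swapped in through the parse.model property. Outside the test harness, the same configuration would look roughly like this (a sketch grounded in the properties the test sets; the input sentence is made up):

    import java.util.Properties;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;

    // Sketch: a pipeline using the shift-reduce parser model, mirroring
    // the properties set in testShiftReduce above.
    public class ShiftReduceExample {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,parse");
        props.setProperty("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("The quick brown fox jumped over the lazy dog.");
        pipeline.annotate(doc);
      }
    }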
24 changes: 17 additions & 7 deletions scripts/nndep/Makefile
@@ -11,9 +11,13 @@ PTB_CONLL_TRAIN=${DATA_DIR}/PTB/CoNLL/train.conll
 PTB_CONLL_DEV=${DATA_DIR}/PTB/CoNLL/dev.conll
 PTB_CONLL_TEST=${DATA_DIR}/PTB/CoNLL/test.conll
 
-CTB_CONLL_TRAIN=${DATA_DIR}/CTB/train.gold.conll
-CTB_CONLL_DEV=${DATA_DIR}/CTB/dev.gold.conll
-CTB_CONLL_TEST=${DATA_DIR}/CTB/test.gold.conll
+CTB5_CONLL_TRAIN=${DATA_DIR}/CTB/ctb5.1/train.gold.conll
+CTB5_CONLL_DEV=${DATA_DIR}/CTB/ctb5.1/dev.gold.conll
+CTB5_CONLL_TEST=${DATA_DIR}/CTB/ctb5.1/test.gold.conll
+
+CTB9_CONLL_TRAIN=${DATA_DIR}/CTB/ctb9.0/chtb.train.conll
+CTB9_CONLL_DEV=${DATA_DIR}/CTB/ctb9.0/chtb.dev.conll
+CTB9_CONLL_TEST=${DATA_DIR}/CTB/ctb9.0/chtb.test.conll
 
 UD_GERMAN_TRAIN=${DATA_DIR}/UD/1.1/de/de-ud-train-clean.conllu
 UD_GERMAN_DEV=${DATA_DIR}/UD/1.1/de/de-ud-dev-clean.conllu
@@ -37,10 +41,16 @@ PTB_CoNLL:
 	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(PTB_CONLL_DEV) -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
 	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(PTB_CONLL_TEST) -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
 
-CTB_CoNLL:
-	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(CTB_CONLL_TRAIN) -language Chinese -devFile $(CTB_CONLL_DEV) -embedFile $(CHINESE_EMBEDDINGS) -model $@.txt.gz >> $@.log 2>&1
-	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB_CONLL_DEV) -language Chinese -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
-	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB_CONLL_TEST) -language Chinese -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
+CTB5_CoNLL:
+	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(CTB5_CONLL_TRAIN) -language Chinese -devFile $(CTB5_CONLL_DEV) -embedFile $(CHINESE_EMBEDDINGS) -model $@.txt.gz >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB5_CONLL_DEV) -language Chinese -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB5_CONLL_TEST) -language Chinese -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
+
+CTB9_CoNLL:
+	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(CTB9_CONLL_TRAIN) -language Chinese -devFile $(CTB9_CONLL_DEV) -embedFile $(CHINESE_EMBEDDINGS) -model $@.txt.gz >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB9_CONLL_DEV) -language Chinese -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
+	java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(CTB9_CONLL_TEST) -language Chinese -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
+
 
 UD_GERMAN:
 	java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(UD_GERMAN_TRAIN) -language German -devFile $(UD_GERMAN_DEV) -model $@.txt.gz >> $@.log 2>&1
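The targets above drive edu.stanford.nlp.parser.nndep.DependencyParser from the command line; the same evaluate step is available programmatically, as the integration test earlier in this diff shows. A rough in-code equivalent of the CTB5_CoNLL dev-set step, assuming placeholder paths for the model and treebank:

    import java.util.Properties;
    import edu.stanford.nlp.parser.nndep.DependencyParser;
    import edu.stanford.nlp.util.StringUtils;

    // Sketch of the CTB5_CoNLL dev-set evaluation done in code. The model
    // and treebank paths are placeholders, not real locations.
    public class Ctb5DevEval {
      public static void main(String[] args) {
        Properties props = StringUtils.stringToProperties("language=Chinese");
        DependencyParser parser = new DependencyParser(props);
        parser.loadModelFile("CTB5_CoNLL.txt.gz");
        double las = parser.testCoNLL("path/to/CTB/ctb5.1/dev.gold.conll", null);
        System.out.printf("CTB 5.1 dev LAS: %.2f%n", las);
      }
    }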
14 changes: 5 additions & 9 deletions src/edu/stanford/nlp/dcoref/CoNLLMentionExtractor.java
@@ -84,7 +84,6 @@ public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics sema
     singletonPredictor = singletonModel;
   }
 
-  private static final boolean includeExtras = false;
   private static final boolean LEMMATIZE = true;
   private static final boolean threadSafe = true;
 
@@ -118,13 +117,10 @@ public Document nextDoc() throws Exception {
       }
       // generate the dependency graph
      try {
-        SemanticGraph deps = SemanticGraphFactory.makeFromTree(tree,
-            SemanticGraphFactory.Mode.COLLAPSED, includeExtras ? GrammaticalStructure.Extras.MAXIMAL : GrammaticalStructure.Extras.NONE, threadSafe, null, true);
-        SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree,
-            SemanticGraphFactory.Mode.BASIC, includeExtras ? GrammaticalStructure.Extras.MAXIMAL : GrammaticalStructure.Extras.NONE, threadSafe, null, true);
+        SemanticGraph deps = SemanticGraphFactory.makeFromTree(tree, SemanticGraphFactory.Mode.ENHANCED, GrammaticalStructure.Extras.NONE, threadSafe);
+        SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree, SemanticGraphFactory.Mode.BASIC, GrammaticalStructure.Extras.NONE, threadSafe);
         sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
-        sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
-        sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, deps);
+        sentence.set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, deps);
       } catch(Exception e) {
         logger.log(Level.WARNING, "Exception caught during extraction of Stanford dependencies. Will ignore and continue...", e);
       }
@@ -267,8 +263,8 @@ public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document
         // will be set by arrange
         mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);
 
-        // Mention dependency is collapsed dependency for sentence
-        mention.dependency = sentences.get(sentIndex).get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        // Mention dependency graph is the enhanced dependency graph of the sentence
+        mention.dependency = sentences.get(sentIndex).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
 
         allGoldMentions.get(sentIndex).add(mention);
       }
6 changes: 4 additions & 2 deletions src/edu/stanford/nlp/dcoref/Dictionaries.java
@@ -181,8 +181,10 @@ public enum Person { I, YOU, HE, SHE, WE, THEY, IT, UNKNOWN}
       "if", "false", "fallacy", "unsuccessfully", "unlikely", "impossible", "improbable", "uncertain", "unsure", "impossibility", "improbability", "cancellation", "breakup", "lack",
       "long-stalled", "end", "rejection", "failure", "avoid", "bar", "block", "break", "cancel", "cease", "cut", "decline", "deny", "deprive", "destroy", "excuse",
       "fail", "forbid", "forestall", "forget", "halt", "lose", "nullify", "prevent", "refrain", "reject", "rebut", "remain", "refuse", "stop", "suspend", "ward"));
-  public final Set<String> neg_relations = Generics.newHashSet(Arrays.asList("prep_without", "prepc_without", "prep_except", "prepc_except", "prep_excluding", "prepx_excluding",
-      "prep_if", "prepc_if", "prep_whether", "prepc_whether", "prep_away_from", "prepc_away_from", "prep_instead_of", "prepc_instead_of"));
+  public final Set<String> neg_relations = Generics.newHashSet(Arrays.asList("nmod:without", "acl:without", "advcl:without",
+      "nmod:except", "acl:except", "advcl:except", "nmod:excluding", "acl:excluding", "advcl:excluding", "nmod:if", "acl:if",
+      "advcl:if", "nmod:whether", "acl:whether", "advcl:whether", "nmod:away_from", "acl:away_from", "advcl:away_from",
+      "nmod:instead_of", "acl:instead_of", "advcl:instead_of"));
   public final Set<String> modals = Generics.newHashSet(Arrays.asList("can", "could", "may", "might", "must", "should", "would", "seem",
       "able", "apparently", "necessarily", "presumably", "probably", "possibly", "reportedly", "supposedly",
       "inconceivable", "chance", "impossibility", "improbability", "encouragement", "improbable", "impossible",
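The replacement relation names follow the Universal Dependencies scheme (nmod:/acl:/advcl: plus the case word) rather than the old collapsed Stanford-dependency prep_/prepc_ names, matching the switch to EnhancedDependenciesAnnotation elsewhere in this commit. A hedged sketch of how a set like neg_relations could be consulted against a SemanticGraph (the surrounding method is hypothetical, not part of this diff):

    import edu.stanford.nlp.dcoref.Dictionaries;
    import edu.stanford.nlp.semgraph.SemanticGraph;
    import edu.stanford.nlp.semgraph.SemanticGraphEdge;

    // Hypothetical check: does any edge of a sentence's dependency graph
    // carry a negation-like relation such as "nmod:without"?
    public class NegRelationCheck {
      public static boolean hasNegRelation(SemanticGraph graph, Dictionaries dict) {
        for (SemanticGraphEdge edge : graph.edgeIterable()) {
          if (dict.neg_relations.contains(edge.getRelation().toString())) {
            return true;
          }
        }
        return false;
      }
    }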
4 changes: 2 additions & 2 deletions src/edu/stanford/nlp/dcoref/Document.java
@@ -705,7 +705,7 @@ private boolean findSpeaker(int utterNum, int sentNum, List<CoreMap> sentences,
       String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
       if(dict.reportVerb.contains(lemma)) {
         // find subject
-        SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
         IndexedWord w = dependency.getNodeByWordPattern(word);
 
         if (w != null) {
@@ -802,7 +802,7 @@ private String findNextParagraphSpeaker(List<CoreMap> paragraph, int paragraphOf
     for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
       if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
         String word = w.get(CoreAnnotations.TextAnnotation.class);
-        SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
         IndexedWord t = dependency.getNodeByWordPattern(word);
 
         for(Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(t)){
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/dcoref/MUCMentionExtractor.java
@@ -272,7 +272,7 @@ else if (w.equals("</COREF>")) {
       List<CoreLabel> unannotatedSent = allWords.get(i);
       List<Mention> mentionInSent = allGoldMentions.get(i);
       for (Mention m : mentionInSent){
-        m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class);
+        m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
       }
       if(annotatedSent.size() != unannotatedSent.size()){
         throw new IllegalStateException("annotatedSent != unannotatedSent");
