
Commit

add ud/feature_map.txt to model generation scripts
sebschu authored and Stanford NLP committed Oct 21, 2015
1 parent 634b55b commit 240cfbf
Showing 59 changed files with 71,333 additions and 74,228 deletions.
2 changes: 1 addition & 1 deletion data/edu/stanford/nlp/upos/ENUniversalPOS.tsurgeon
@@ -64,7 +64,7 @@ relabel target AUX
 %relabel target AUX

 % VB.* -> AUX (active, case 1)
-VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)
+VP < VP < (/^VB.*$/=target <... {/.*/})

 relabel target AUX
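The new rule drops the long hard-coded auxiliary word list and instead relabels any VB.* head in this VP-over-VP configuration. As a minimal sketch of how such a rule behaves, the following standalone snippet (not part of this commit; it uses a simplified pattern and a toy tree) applies the relabeling through the Tregex/Tsurgeon API:

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;

public class TsurgeonRuleDemo {
  public static void main(String[] args) {
    // Simplified form of the rule above: a VB.* sibling of a VP inside a VP
    // is treated as an auxiliary, so the matched node is relabeled AUX.
    TregexPattern match = TregexPattern.compile("VP < VP < (/^VB.*$/=target)");
    TsurgeonPattern relabel = Tsurgeon.parseOperation("relabel target AUX");
    Tree tree = Tree.valueOf("(S (NP (PRP He)) (VP (VBZ is) (VP (VBG running))))");
    Tree out = Tsurgeon.processPattern(match, relabel, tree);
    System.out.println(out);  // (S (NP (PRP He)) (VP (AUX is) (VP (VBG running))))
  }
}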
@@ -97,7 +97,6 @@ public void testSimpleSentenceJSON() throws IOException {
 " {\n" +
 " \"index\": 1,\n" +
 " \"word\": \"Bad\",\n" +
-" \"originalText\": \"Bad\",\n" +
 " \"lemma\": \"bad\",\n" +
 " \"characterOffsetBegin\": 0,\n" +
 " \"characterOffsetEnd\": 3,\n" +
@@ -109,7 +108,6 @@ public void testSimpleSentenceJSON() throws IOException {
 " {\n" +
 " \"index\": 2,\n" +
 " \"word\": \"wolf\",\n" +
-" \"originalText\": \"wolf\",\n" +
 " \"lemma\": \"wolf\",\n" +
 " \"characterOffsetBegin\": 4,\n" +
 " \"characterOffsetEnd\": 8,\n" +
38 changes: 0 additions & 38 deletions itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java

This file was deleted.

@@ -1,6 +1,6 @@
 package edu.stanford.nlp.pipeline;

-import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
 import edu.stanford.nlp.ie.NumberNormalizer;
 import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
 import edu.stanford.nlp.ling.CoreAnnotations;
@@ -405,11 +405,6 @@ public void testRelation() {
     testAnnotators("tokenize,ssplit,pos,lemma,ner,parse,relation");
   }

-  @Test
-  public void testUDFeats() {
-    testAnnotators("tokenize,ssplit,pos,depparse,udfeats");
-  }
-
   @Test
   public void testSerializeSSplitTokensRegression() {
     testAnnotators("tokenize,ssplit");
@@ -468,7 +463,7 @@ public void testAllAnnotatorCombinations() {
       if (!annotatorsToConsider.isEmpty()) { continue; }  // continue if we couldn't add all the annotators

       // Create pipeline
-      if (!annotators.contains("dcoref") && !annotators.contains("entitymentions")) { // TODO(gabor) eventually, don't ignore entitymentions!
+      if (!annotators.contains("hcoref") && !annotators.contains("entitymentions")) { // TODO(gabor) eventually, don't ignore this!
         System.err.println(">>TESTING " + StringUtils.join(annotators, ","));
         testAnnotators(StringUtils.join(annotators, ","));
       }
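For context, testAnnotators builds a StanfordCoreNLP pipeline from the given annotator list and runs it over fixed test text. A minimal sketch of that usage follows (illustrative only, not taken from the test; the input sentence is made up):

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineDemo {
  public static void main(String[] args) {
    // Same comma-separated annotator-list format the tests pass around
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Bad wolf.");
    pipeline.annotate(document);
    System.out.println(document.toShorterString());
  }
}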
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/dcoref/CorefCoreAnnotations.java
@@ -73,7 +73,7 @@ public Class<Integer> getType() {
    * CoreLabel. Note that the list includes the CoreLabel that was
    * annotated which creates a cycle.
    *
-   * @deprecated This was an original dcoref annotation. You should now use CorefChainAnnotation
+   * @deprecated This was an original dcoref annotation. You should know use CorefChainAnnotation
    */
   @Deprecated
   public static class CorefClusterAnnotation implements CoreAnnotation<Set<CoreLabel>> {
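The deprecation note points callers at CorefChainAnnotation. A minimal sketch of reading coreference output through the non-deprecated annotation (illustrative only; assumes an Annotation produced by a pipeline that ran dcoref):

import java.util.Map;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class CorefChainDemo {
  // Prints every coreference chain attached to the document.
  static void printChains(Annotation document) {
    Map<Integer, CorefChain> chains =
        document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    for (CorefChain chain : chains.values()) {
      System.out.println(chain);
    }
  }
}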
17 changes: 4 additions & 13 deletions src/edu/stanford/nlp/hcoref/CorefDocMaker.java
@@ -49,10 +49,7 @@ public class CorefDocMaker {
   StanfordCoreNLP corenlp;
   final TreeLemmatizer treeLemmatizer;
   LogisticClassifier<String, String> singletonPredictor;
-  // Should we call corenlp to add missing annotations?
-  // HACK so that when the CorefDocMaker is called from annotator, it doesn't override old annotations
-  boolean needMissingAnnotations = true;

   public CorefDocMaker(Properties props, Dictionaries dictionaries) throws ClassNotFoundException, IOException {
     this.props = props;
     this.dict = dictionaries;
@@ -65,11 +62,7 @@ public CorefDocMaker(Properties props, Dictionaries dictionaries) throws ClassNo
     singletonPredictor = (CorefProperties.useSingletonPredictor(props))?
         getSingletonPredictorFromSerializedFile(CorefProperties.getPathSingletonPredictor(props)) : null;
   }
-
-  public void setNeedMissingAnnotations(boolean needMissingAnnotations) {
-    this.needMissingAnnotations = needMissingAnnotations;
-  }

   /** Load Stanford Processor: skip unnecessary annotator */
   protected StanfordCoreNLP loadStanfordProcessor(Properties props) {
@@ -153,16 +146,14 @@ public Document makeDocument(Annotation anno) throws Exception {
   /**
   * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)).
   * Mention detection and document preprocessing is done here.
   * @throws Exception
   */
   public Document makeDocument(InputDoc input) throws Exception {
     if (input == null) return null;
     Annotation anno = input.annotation;

     // add missing annotation
-    if (needMissingAnnotations) {
-      addMissingAnnotation(anno);
-    }
+    addMissingAnnotation(anno);

     if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) {
       anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
6 changes: 3 additions & 3 deletions src/edu/stanford/nlp/hcoref/CorefProperties.java
@@ -195,8 +195,8 @@ public static String getPathSingletonPredictor(Properties props) {
     return PropertiesUtils.getString(props, PATH_SINGLETON_PREDICTOR_PROP, "edu/stanford/nlp/models/dcoref/singleton.predictor.ser");
   }
   public static String getPathModel(Properties props, String sievename) {
-    return props.getProperty(PATH_SERIALIZED_PROP) + File.separator +
-        props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename);
+    return new File(props.getProperty(PATH_SERIALIZED_PROP),
+        props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename)).getAbsolutePath();
   }
   public static boolean debug(Properties props) {
     return PropertiesUtils.getBool(props, DEBUG_PROP, false);
@@ -325,7 +325,7 @@ public static boolean useSemantics(Properties props) {
     return PropertiesUtils.getBool(props, USE_SEMANTICS_PROP, true);
   }
   public static String getPathSerializedWordVectors(Properties props) {
-    return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser.gz");
+    return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser");
  }
   public static String getCurrentSieveForTrain(Properties props) {
     return PropertiesUtils.getString(props, CURRENT_SIEVE_FOR_TRAIN_PROP, null);
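The getPathModel rewrite swaps manual separator concatenation for java.io.File path joining, which normalizes separators and yields an absolute path. A small standalone sketch of the difference, with hypothetical property values (not from the commit):

import java.io.File;

public class PathDemo {
  public static void main(String[] args) {
    String dir = "/tmp/coref-models";        // stands in for PATH_SERIALIZED_PROP (hypothetical)
    String model = "model_pronoun.ser.gz";   // stands in for a per-sieve model name (hypothetical)

    // Old style: manual concatenation; correct only if 'dir' lacks a trailing separator.
    String oldStyle = dir + File.separator + model;

    // New style: File joins the parts and resolves to an absolute, normalized path.
    String newStyle = new File(dir, model).getAbsolutePath();

    System.out.println(oldStyle);
    System.out.println(newStyle);
  }
}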
18 changes: 8 additions & 10 deletions src/edu/stanford/nlp/hcoref/Preprocessor.java
@@ -29,12 +29,13 @@
 import edu.stanford.nlp.semgraph.SemanticGraph;
 import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
 import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
+import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation;
 import edu.stanford.nlp.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
 import edu.stanford.nlp.trees.GrammaticalRelation;
 import edu.stanford.nlp.trees.HeadFinder;
 import edu.stanford.nlp.trees.Tree;
 import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
-import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
 import edu.stanford.nlp.util.CollectionValuedMap;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Generics;
@@ -125,7 +126,6 @@ protected static int getHeadIndex(Tree t, HeadFinder headFinder) {

   private static List<Mention> mentionReorderingBySpan(List<Mention> mentionsInSent) {
     TreeSet<Mention> ordering = new TreeSet<Mention>(new Comparator<Mention>(){
-      @Override
       public int compare(Mention m1, Mention m2) {
         return (m1.appearEarlierThan(m2))? -1 : (m2.appearEarlierThan(m1))? 1 : 0;
       }
@@ -298,7 +298,7 @@ private static void fillMentionInfo(Document doc, Dictionaries dict,
       m.contextParseTree = sentence.get(TreeAnnotation.class);
 //      m.sentenceWords = sentence.get(TokensAnnotation.class);
       m.basicDependency = sentence.get(BasicDependenciesAnnotation.class);
-      m.collapsedDependency = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
+      m.collapsedDependency = sentence.get(CollapsedDependenciesAnnotation.class);

       // mentionSubTree (highest NP that has the same head) if constituency tree available
       if (m.contextParseTree != null) {
@@ -343,7 +343,7 @@ private static void findSyntacticRelationsFromDependency(List<Mention> orderedMe

     // apposition
     Set<Pair<Integer, Integer>> appos = Generics.newHashSet();
-    List<SemanticGraphEdge> appositions = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.APPOSITIONAL_MODIFIER);
+    List<SemanticGraphEdge> appositions = dependency.findAllRelns(EnglishGrammaticalRelations.APPOSITIONAL_MODIFIER);
     for(SemanticGraphEdge edge : appositions) {
       int sIdx = edge.getSource().index()-1;
       int tIdx = edge.getTarget().index()-1;
@@ -353,18 +353,18 @@

     // predicate nominatives
     Set<Pair<Integer, Integer>> preNomi = Generics.newHashSet();
-    List<SemanticGraphEdge> copula = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.COPULA);
+    List<SemanticGraphEdge> copula = dependency.findAllRelns(EnglishGrammaticalRelations.COPULA);
     for(SemanticGraphEdge edge : copula) {
       IndexedWord source = edge.getSource();
-      IndexedWord target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.NOMINAL_SUBJECT);
-      if(target==null) target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.CLAUSAL_SUBJECT);
+      IndexedWord target = dependency.getChildWithReln(source, EnglishGrammaticalRelations.NOMINAL_SUBJECT);
+      if(target==null) target = dependency.getChildWithReln(source, EnglishGrammaticalRelations.CLAUSAL_SUBJECT);
       // TODO
       if(target == null) continue;

       // to handle relative clause: e.g., Tim who is a student,
       if(target.tag().startsWith("W")) {
         IndexedWord parent = dependency.getParent(source);
-        if(parent!=null && dependency.reln(parent, source).equals(UniversalEnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) {
+        if(parent!=null && dependency.reln(parent, source).equals(EnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) {
           target = parent;
         }
       }
@@ -412,7 +412,6 @@ private static DocType findDocType(Document doc) {
     if(!speakerChange) return DocType.ARTICLE;
     return DocType.CONVERSATION; // in conversation, utter index keep increasing.
   }
-
   /** Set paragraph index */
   private static void setParagraphAnnotation(Document doc) {
     int paragraphIndex = 0;
@@ -630,7 +629,6 @@ private static void findSpeakers(Document doc, Dictionaries dict) {
         }
       }
     }
-
   private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
     List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
     IntPair beginQuotation = null;
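The substantive change in this file swaps UniversalEnglishGrammaticalRelations for EnglishGrammaticalRelations in the apposition and predicate-nominative passes. A minimal sketch of the predicate-nominative walk these hunks perform, using the Stanford-dependencies relations the new code relies on (illustrative, not from the commit; for "Tim is a student", cop(student, is) plus nsubj(student, Tim) links "Tim" and "student"):

import java.util.List;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;

public class CopulaDemo {
  // Given an already-built dependency graph, pair each copular predicate
  // with its nominal subject, as the predicate-nominative pass does.
  static void printPredicateNominatives(SemanticGraph dependency) {
    List<SemanticGraphEdge> copula = dependency.findAllRelns(EnglishGrammaticalRelations.COPULA);
    for (SemanticGraphEdge edge : copula) {
      IndexedWord predicate = edge.getSource();
      IndexedWord subject = dependency.getChildWithReln(predicate, EnglishGrammaticalRelations.NOMINAL_SUBJECT);
      if (subject != null) {
        System.out.println(subject.word() + " <-> " + predicate.word());
      }
    }
  }
}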
37 changes: 19 additions & 18 deletions src/edu/stanford/nlp/hcoref/data/Dictionaries.java
@@ -15,7 +15,7 @@
 import edu.stanford.nlp.hcoref.CorefProperties;
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
-import edu.stanford.nlp.neural.VectorMap;
+import edu.stanford.nlp.math.ArrayMath;
 import edu.stanford.nlp.pipeline.DefaultPaths;
 import edu.stanford.nlp.stats.ClassicCounter;
 import edu.stanford.nlp.stats.Counter;
@@ -202,7 +202,7 @@ private void readWordLists(Locale lang) {

   public int dimVector;

-  public VectorMap vectors = new VectorMap();
+  public Map<String, float[]> vectors = Generics.newHashMap();

   public Map<String, String> strToEntity = Generics.newHashMap();
   public Counter<String> dictScore = new ClassicCounter<String>();
@@ -535,23 +535,24 @@ public void loadSemantics(Properties props) throws ClassNotFoundException, IOExc
     if(CorefProperties.loadWordEmbedding(props)) {
       System.err.println("LOAD: WordVectors");
       String wordvectorFile = CorefProperties.getPathSerializedWordVectors(props);
-      String word2vecFile = CorefProperties.getPathWord2Vec(props);
-      try {
-        // Try to read the serialized vectors
-        vectors = VectorMap.deserialize(wordvectorFile);
-      } catch (IOException e) {
-        // If that fails, try to read the vectors from the word2vec file
-        if(new File(word2vecFile).exists()) {
-          vectors = VectorMap.readWord2Vec(word2vecFile);
-          if (wordvectorFile != null && !wordvectorFile.startsWith("edu")) {
-            vectors.serialize(wordvectorFile);
-          }
-        } else {
-          // If that fails, give up and crash
-          throw new RuntimeIOException(e);
-        }
-      }
-      dimVector = vectors.entrySet().iterator().next().getValue().length;
+      if(new File(wordvectorFile).exists()) {
+        vectors = IOUtils.readObjectFromFile(wordvectorFile);
+        dimVector = vectors.entrySet().iterator().next().getValue().length;
+      } else {
+        for(String line : IOUtils.readLines(CorefProperties.getPathWord2Vec(props))){
+          String[] split = line.toLowerCase().split("\\s+");
+          if(split.length < 100) continue;
+          float[] vector = new float[split.length-1];
+          for(int i=1; i < split.length ; i++) {
+            vector[i-1] = Float.parseFloat(split[i]);
+          }
+          ArrayMath.L2normalize(vector);
+          vectors.put(split[0], vector);
+          dimVector = vector.length;
+        }
+        if(wordvectorFile!=null) IOUtils.writeObjectToFile(vectors, wordvectorFile);
+      }

 //    if(Boolean.parseBoolean(props.getProperty("useValDictionary"))) {
 //      System.err.println("LOAD: ValDictionary");
@@ -580,8 +581,8 @@ public Dictionaries(Properties props) throws ClassNotFoundException, IOException
         props.getProperty(CorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES),
         CorefProperties.getSieves(props).contains("CorefDictionaryMatch"),
         PropertiesUtils.getStringArray(props, CorefProperties.DICT_LIST_PROP,
             new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
                 DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
         props.getProperty(CorefProperties.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1),
         props.getProperty(CorefProperties.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES));
     if(CorefProperties.useSemantics(props)) {
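The new loader reads vectors either from a serialized map or from a word2vec-format text file, one token per line followed by its components, L2-normalizing each vector as it goes. A toy sketch of parsing one such line (illustrative, not from the commit; the real loader skips lines with fewer than 100 fields, while this example uses 2 dimensions for brevity):

import edu.stanford.nlp.math.ArrayMath;

public class VectorLineDemo {
  public static void main(String[] args) {
    // word2vec text format: "token c1 c2 ... cN"
    String line = "the 0.6 0.8";  // toy 2-dimensional entry
    String[] split = line.toLowerCase().split("\\s+");
    float[] vector = new float[split.length - 1];
    for (int i = 1; i < split.length; i++) {
      vector[i - 1] = Float.parseFloat(split[i]);
    }
    ArrayMath.L2normalize(vector);  // (0.6, 0.8) already has unit length, so it is unchanged
    System.out.println(split[0] + " -> " + vector[0] + " " + vector[1]);
  }
}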
