
Commit

add ud/feature_map.txt to model generation scripts
sebschu authored and Stanford NLP committed Oct 21, 2015
1 parent 634b55b commit 240cfbf
Showing 59 changed files with 71,333 additions and 74,228 deletions.
2 changes: 1 addition & 1 deletion data/edu/stanford/nlp/upos/ENUniversalPOS.tsurgeon
@@ -64,7 +64,7 @@ relabel target AUX
 %relabel target AUX

 % VB.* -> AUX (active, case 1)
-VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)
+VP < VP < (/^VB.*$/=target <... {/.*/})

 relabel target AUX
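The new rule drops the long hard-coded auxiliary word list and instead relabels any VB.* head in this VP-over-VP configuration. As a minimal sketch of how such a rule behaves, the following standalone snippet (not part of this commit; it uses a simplified pattern and a toy tree) applies the relabeling through the Tregex/Tsurgeon API:

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;

public class TsurgeonRuleDemo {
  public static void main(String[] args) {
    // Simplified form of the rule above: a VB.* sibling of a VP inside a VP
    // is treated as an auxiliary, so the matched node is relabeled AUX.
    TregexPattern match = TregexPattern.compile("VP < VP < (/^VB.*$/=target)");
    TsurgeonPattern relabel = Tsurgeon.parseOperation("relabel target AUX");
    Tree tree = Tree.valueOf("(S (NP (PRP He)) (VP (VBZ is) (VP (VBG running))))");
    Tree out = Tsurgeon.processPattern(match, relabel, tree);
    System.out.println(out);  // (S (NP (PRP He)) (VP (AUX is) (VP (VBG running))))
  }
}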
@@ -97,7 +97,6 @@ public void testSimpleSentenceJSON() throws IOException {
 " {\n" +
 " \"index\": 1,\n" +
 " \"word\": \"Bad\",\n" +
-" \"originalText\": \"Bad\",\n" +
 " \"lemma\": \"bad\",\n" +
 " \"characterOffsetBegin\": 0,\n" +
 " \"characterOffsetEnd\": 3,\n" +
@@ -109,7 +108,6 @@ public void testSimpleSentenceJSON() throws IOException {
 " {\n" +
 " \"index\": 2,\n" +
 " \"word\": \"wolf\",\n" +
-" \"originalText\": \"wolf\",\n" +
 " \"lemma\": \"wolf\",\n" +
 " \"characterOffsetBegin\": 4,\n" +
 " \"characterOffsetEnd\": 8,\n" +
38 changes: 0 additions & 38 deletions itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java

This file was deleted.

@@ -1,6 +1,6 @@
 package edu.stanford.nlp.pipeline;

-import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
 import edu.stanford.nlp.ie.NumberNormalizer;
 import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
 import edu.stanford.nlp.ling.CoreAnnotations;
@@ -405,11 +405,6 @@ public void testRelation() {
     testAnnotators("tokenize,ssplit,pos,lemma,ner,parse,relation");
   }

-  @Test
-  public void testUDFeats() {
-    testAnnotators("tokenize,ssplit,pos,depparse,udfeats");
-  }
-
   @Test
   public void testSerializeSSplitTokensRegression() {
     testAnnotators("tokenize,ssplit");
@@ -468,7 +463,7 @@ public void testAllAnnotatorCombinations() {
       if (!annotatorsToConsider.isEmpty()) { continue; }  // continue if we couldn't add all the annotators

       // Create pipeline
-      if (!annotators.contains("dcoref") && !annotators.contains("entitymentions")) { // TODO(gabor) eventually, don't ignore entitymentions!
+      if (!annotators.contains("hcoref") && !annotators.contains("entitymentions")) { // TODO(gabor) eventually, don't ignore this!
         System.err.println(">>TESTING " + StringUtils.join(annotators, ","));
         testAnnotators(StringUtils.join(annotators, ","));
       }
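For context, testAnnotators builds a StanfordCoreNLP pipeline from the given annotator list and runs it over fixed test text. A minimal sketch of that usage follows (illustrative only, not taken from the test; the input sentence is made up):

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineDemo {
  public static void main(String[] args) {
    // Same comma-separated annotator-list format the tests pass around
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Bad wolf.");
    pipeline.annotate(document);
    System.out.println(document.toShorterString());
  }
}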
2 changes: 1 addition & 1 deletion src/edu/stanford/nlp/dcoref/CorefCoreAnnotations.java
@@ -73,7 +73,7 @@ public Class<Integer> getType() {
    * CoreLabel. Note that the list includes the CoreLabel that was
    * annotated which creates a cycle.
    *
-   * @deprecated This was an original dcoref annotation. You should now use CorefChainAnnotation
+   * @deprecated This was an original dcoref annotation. You should know use CorefChainAnnotation
    */
   @Deprecated
   public static class CorefClusterAnnotation implements CoreAnnotation<Set<CoreLabel>> {
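The deprecation note points callers at CorefChainAnnotation. A minimal sketch of reading coreference output through the non-deprecated annotation (illustrative only; assumes an Annotation produced by a pipeline that ran dcoref):

import java.util.Map;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class CorefChainDemo {
  // Prints every coreference chain attached to the document.
  static void printChains(Annotation document) {
    Map<Integer, CorefChain> chains =
        document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    for (CorefChain chain : chains.values()) {
      System.out.println(chain);
    }
  }
}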
17 changes: 4 additions & 13 deletions src/edu/stanford/nlp/hcoref/CorefDocMaker.java
@@ -49,10 +49,7 @@ public class CorefDocMaker {
   StanfordCoreNLP corenlp;
   final TreeLemmatizer treeLemmatizer;
   LogisticClassifier<String, String> singletonPredictor;
-  // Should we call corenlp to add missing annotations?
-  // HACK so that when the CorefDocMaker is called from annotator, it doesn't override old annotations
-  boolean needMissingAnnotations = true;

   public CorefDocMaker(Properties props, Dictionaries dictionaries) throws ClassNotFoundException, IOException {
     this.props = props;
     this.dict = dictionaries;
@@ -65,11 +62,7 @@ public CorefDocMaker(Properties props, Dictionaries dictionaries) throws ClassNo
     singletonPredictor = (CorefProperties.useSingletonPredictor(props))?
         getSingletonPredictorFromSerializedFile(CorefProperties.getPathSingletonPredictor(props)) : null;
   }
-
-  public void setNeedMissingAnnotations(boolean needMissingAnnotations) {
-    this.needMissingAnnotations = needMissingAnnotations;
-  }

   /** Load Stanford Processor: skip unnecessary annotator */
   protected StanfordCoreNLP loadStanfordProcessor(Properties props) {
@@ -153,16 +146,14 @@ public Document makeDocument(Annotation anno) throws Exception {
   /**
   * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)).
   * Mention detection and document preprocessing is done here.
   * @throws Exception
   */
   public Document makeDocument(InputDoc input) throws Exception {
     if (input == null) return null;
     Annotation anno = input.annotation;

     // add missing annotation
-    if (needMissingAnnotations) {
-      addMissingAnnotation(anno);
-    }
+    addMissingAnnotation(anno);

     if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) {
       anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
6 changes: 3 additions & 3 deletions src/edu/stanford/nlp/hcoref/CorefProperties.java
@@ -195,8 +195,8 @@ public static String getPathSingletonPredictor(Properties props) {
     return PropertiesUtils.getString(props, PATH_SINGLETON_PREDICTOR_PROP, "edu/stanford/nlp/models/dcoref/singleton.predictor.ser");
   }
   public static String getPathModel(Properties props, String sievename) {
-    return props.getProperty(PATH_SERIALIZED_PROP) + File.separator +
-        props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename);
+    return new File(props.getProperty(PATH_SERIALIZED_PROP),
+        props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename)).getAbsolutePath();
   }
   public static boolean debug(Properties props) {
     return PropertiesUtils.getBool(props, DEBUG_PROP, false);
@@ -325,7 +325,7 @@ public static boolean useSemantics(Properties props) {
     return PropertiesUtils.getBool(props, USE_SEMANTICS_PROP, true);
   }
   public static String getPathSerializedWordVectors(Properties props) {
-    return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser.gz");
+    return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser");
  }
   public static String getCurrentSieveForTrain(Properties props) {
     return PropertiesUtils.getString(props, CURRENT_SIEVE_FOR_TRAIN_PROP, null);
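The getPathModel rewrite swaps manual separator concatenation for java.io.File path joining, which normalizes separators and yields an absolute path. A small standalone sketch of the difference, with hypothetical property values (not from the commit):

import java.io.File;

public class PathDemo {
  public static void main(String[] args) {
    String dir = "/tmp/coref-models";        // stands in for PATH_SERIALIZED_PROP (hypothetical)
    String model = "model_pronoun.ser.gz";   // stands in for a per-sieve model name (hypothetical)

    // Old style: manual concatenation; correct only if 'dir' lacks a trailing separator.
    String oldStyle = dir + File.separator + model;

    // New style: File joins the parts and resolves to an absolute, normalized path.
    String newStyle = new File(dir, model).getAbsolutePath();

    System.out.println(oldStyle);
    System.out.println(newStyle);
  }
}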
18 changes: 8 additions & 10 deletions src/edu/stanford/nlp/hcoref/Preprocessor.java
@@ -29,12 +29,13 @@
 import edu.stanford.nlp.semgraph.SemanticGraph;
 import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
 import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
+import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation;
 import edu.stanford.nlp.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
 import edu.stanford.nlp.trees.GrammaticalRelation;
 import edu.stanford.nlp.trees.HeadFinder;
 import edu.stanford.nlp.trees.Tree;
 import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
-import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
 import edu.stanford.nlp.util.CollectionValuedMap;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Generics;
@@ -125,7 +126,6 @@ protected static int getHeadIndex(Tree t, HeadFinder headFinder) {

   private static List<Mention> mentionReorderingBySpan(List<Mention> mentionsInSent) {
     TreeSet<Mention> ordering = new TreeSet<Mention>(new Comparator<Mention>(){
-      @Override
       public int compare(Mention m1, Mention m2) {
         return (m1.appearEarlierThan(m2))? -1 : (m2.appearEarlierThan(m1))? 1 : 0;
       }
@@ -298,7 +298,7 @@ private static void fillMentionInfo(Document doc, Dictionaries dict,
       m.contextParseTree = sentence.get(TreeAnnotation.class);
 //      m.sentenceWords = sentence.get(TokensAnnotation.class);
       m.basicDependency = sentence.get(BasicDependenciesAnnotation.class);
-      m.collapsedDependency = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
+      m.collapsedDependency = sentence.get(CollapsedDependenciesAnnotation.class);

       // mentionSubTree (highest NP that has the same head) if constituency tree available
       if (m.contextParseTree != null) {
@@ -343,7 +343,7 @@ private static void findSyntacticRelationsFromDependency(List<Mention> orderedMe

     // apposition
     Set<Pair<Integer, Integer>> appos = Generics.newHashSet();
-    List<SemanticGraphEdge> appositions = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.APPOSITIONAL_MODIFIER);
+    List<SemanticGraphEdge> appositions = dependency.findAllRelns(EnglishGrammaticalRelations.APPOSITIONAL_MODIFIER);
     for(SemanticGraphEdge edge : appositions) {
       int sIdx = edge.getSource().index()-1;
       int tIdx = edge.getTarget().index()-1;
@@ -353,18 +353,18 @@

     // predicate nominatives
     Set<Pair<Integer, Integer>> preNomi = Generics.newHashSet();
-    List<SemanticGraphEdge> copula = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.COPULA);
+    List<SemanticGraphEdge> copula = dependency.findAllRelns(EnglishGrammaticalRelations.COPULA);
     for(SemanticGraphEdge edge : copula) {
       IndexedWord source = edge.getSource();
-      IndexedWord target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.NOMINAL_SUBJECT);
-      if(target==null) target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.CLAUSAL_SUBJECT);
+      IndexedWord target = dependency.getChildWithReln(source, EnglishGrammaticalRelations.NOMINAL_SUBJECT);
+      if(target==null) target = dependency.getChildWithReln(source, EnglishGrammaticalRelations.CLAUSAL_SUBJECT);
       // TODO
       if(target == null) continue;

       // to handle relative clause: e.g., Tim who is a student,
       if(target.tag().startsWith("W")) {
         IndexedWord parent = dependency.getParent(source);
-        if(parent!=null && dependency.reln(parent, source).equals(UniversalEnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) {
+        if(parent!=null && dependency.reln(parent, source).equals(EnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) {
           target = parent;
         }
       }
@@ -412,7 +412,6 @@ private static DocType findDocType(Document doc) {
     if(!speakerChange) return DocType.ARTICLE;
     return DocType.CONVERSATION; // in conversation, utter index keep increasing.
   }
-
   /** Set paragraph index */
   private static void setParagraphAnnotation(Document doc) {
     int paragraphIndex = 0;
@@ -630,7 +629,6 @@ private static void findSpeakers(Document doc, Dictionaries dict) {
         }
       }
     }
-
   private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
     List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
     IntPair beginQuotation = null;
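The substantive change in this file swaps UniversalEnglishGrammaticalRelations for EnglishGrammaticalRelations in the apposition and predicate-nominative passes. A minimal sketch of the predicate-nominative walk these hunks perform, using the Stanford-dependencies relations the new code relies on (illustrative, not from the commit; for "Tim is a student", cop(student, is) plus nsubj(student, Tim) links "Tim" and "student"):

import java.util.List;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;

public class CopulaDemo {
  // Given an already-built dependency graph, pair each copular predicate
  // with its nominal subject, as the predicate-nominative pass does.
  static void printPredicateNominatives(SemanticGraph dependency) {
    List<SemanticGraphEdge> copula = dependency.findAllRelns(EnglishGrammaticalRelations.COPULA);
    for (SemanticGraphEdge edge : copula) {
      IndexedWord predicate = edge.getSource();
      IndexedWord subject = dependency.getChildWithReln(predicate, EnglishGrammaticalRelations.NOMINAL_SUBJECT);
      if (subject != null) {
        System.out.println(subject.word() + " <-> " + predicate.word());
      }
    }
  }
}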
37 changes: 19 additions & 18 deletions src/edu/stanford/nlp/hcoref/data/Dictionaries.java
@@ -15,7 +15,7 @@
 import edu.stanford.nlp.hcoref.CorefProperties;
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
-import edu.stanford.nlp.neural.VectorMap;
+import edu.stanford.nlp.math.ArrayMath;
 import edu.stanford.nlp.pipeline.DefaultPaths;
 import edu.stanford.nlp.stats.ClassicCounter;
 import edu.stanford.nlp.stats.Counter;
@@ -202,7 +202,7 @@ private void readWordLists(Locale lang) {

   public int dimVector;

-  public VectorMap vectors = new VectorMap();
+  public Map<String, float[]> vectors = Generics.newHashMap();

   public Map<String, String> strToEntity = Generics.newHashMap();
   public Counter<String> dictScore = new ClassicCounter<String>();
@@ -535,23 +535,24 @@ public void loadSemantics(Properties props) throws ClassNotFoundException, IOExc
     if(CorefProperties.loadWordEmbedding(props)) {
       System.err.println("LOAD: WordVectors");
       String wordvectorFile = CorefProperties.getPathSerializedWordVectors(props);
-      String word2vecFile = CorefProperties.getPathWord2Vec(props);
-      try {
-        // Try to read the serialized vectors
-        vectors = VectorMap.deserialize(wordvectorFile);
-      } catch (IOException e) {
-        // If that fails, try to read the vectors from the word2vec file
-        if(new File(word2vecFile).exists()) {
-          vectors = VectorMap.readWord2Vec(word2vecFile);
-          if (wordvectorFile != null && !wordvectorFile.startsWith("edu")) {
-            vectors.serialize(wordvectorFile);
-          }
-        } else {
-          // If that fails, give up and crash
-          throw new RuntimeIOException(e);
-        }
-      }
-      dimVector = vectors.entrySet().iterator().next().getValue().length;
+      if(new File(wordvectorFile).exists()) {
+        vectors = IOUtils.readObjectFromFile(wordvectorFile);
+        dimVector = vectors.entrySet().iterator().next().getValue().length;
+      } else {
+        for(String line : IOUtils.readLines(CorefProperties.getPathWord2Vec(props))){
+          String[] split = line.toLowerCase().split("\\s+");
+          if(split.length < 100) continue;
+          float[] vector = new float[split.length-1];
+          for(int i=1; i < split.length ; i++) {
+            vector[i-1] = Float.parseFloat(split[i]);
+          }
+          ArrayMath.L2normalize(vector);
+          vectors.put(split[0], vector);
+          dimVector = vector.length;
+        }
+        if(wordvectorFile!=null) IOUtils.writeObjectToFile(vectors, wordvectorFile);
+      }

 //    if(Boolean.parseBoolean(props.getProperty("useValDictionary"))) {
 //      System.err.println("LOAD: ValDictionary");
@@ -580,8 +581,8 @@ public Dictionaries(Properties props) throws ClassNotFoundException, IOException
         props.getProperty(CorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES),
         CorefProperties.getSieves(props).contains("CorefDictionaryMatch"),
         PropertiesUtils.getStringArray(props, CorefProperties.DICT_LIST_PROP,
             new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
                 DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
         props.getProperty(CorefProperties.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1),
         props.getProperty(CorefProperties.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES));
     if(CorefProperties.useSemantics(props)) {
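The new loader reads vectors either from a serialized map or from a word2vec-format text file, one token per line followed by its components, L2-normalizing each vector as it goes. A toy sketch of parsing one such line (illustrative, not from the commit; the real loader skips lines with fewer than 100 fields, while this example uses 2 dimensions for brevity):

import edu.stanford.nlp.math.ArrayMath;

public class VectorLineDemo {
  public static void main(String[] args) {
    // word2vec text format: "token c1 c2 ... cN"
    String line = "the 0.6 0.8";  // toy 2-dimensional entry
    String[] split = line.toLowerCase().split("\\s+");
    float[] vector = new float[split.length - 1];
    for (int i = 1; i < split.length; i++) {
      vector[i - 1] = Float.parseFloat(split[i]);
    }
    ArrayMath.L2normalize(vector);  // (0.6, 0.8) already has unit length, so it is unchanged
    System.out.println(split[0] + " -> " + vector[0] + " " + vector[1]);
  }
}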
