Create custom annotator for UD features.

stanfordnlp · Oct 16, 2015 · b8a9bbe · b8a9bbe
1 parent 6e117da
commit b8a9bbe
Show file tree

Hide file tree

Showing 11 changed files with 208 additions and 121 deletions.
diff --git a/src/edu/stanford/nlp/hcoref/properties/coref-default-dep.properties b/src/edu/stanford/nlp/hcoref/properties/coref-default-dep.properties
@@ -16,13 +16,11 @@ hcoref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train/data/english/anno
 # data & model path
 
 ## models
-#hcoref.path.serialized = /scr/nlp/data/coref/models/stanford/
+hcoref.path.serialized = /scr/nlp/data/coref/models/stanford/
-hcoref.path.serialized = edu/stanford/nlp/models/hcoref/
 
 ## other data
 hcoref.big.gender.number = edu/stanford/nlp/models/dcoref/gender.data.gz
-#hcoref.path.word2vec = /scr/nlp/data/coref/wordvectors/en/vectors.txt.gz
+hcoref.path.word2vec = /scr/nlp/data/coref/wordvectors/en/vectors.txt.gz
-hcoref.path.word2vec = edu/stanford/nlp/models/word2vec/vectors.txt.gz
 
 #############################################################################
 # mention detection

diff --git a/src/edu/stanford/nlp/pipeline/Annotator.java b/src/edu/stanford/nlp/pipeline/Annotator.java
@@ -108,6 +108,7 @@ public String toString() {
   String STANFORD_NATLOG = "natlog";
   String STANFORD_OPENIE = "openie";
   String STANFORD_QUOTE = "quote";
+  String STANFORD_UD_FEATURES = "udfeats";
 
   Requirement TOKENIZE_REQUIREMENT = new Requirement(STANFORD_TOKENIZE);
   Requirement CLEAN_XML_REQUIREMENT = new Requirement(STANFORD_CLEAN_XML);
@@ -125,97 +126,105 @@ public String toString() {
   Requirement NATLOG_REQUIREMENT = new Requirement(STANFORD_NATLOG);
   Requirement OPENIE_REQUIREMENT = new Requirement(STANFORD_OPENIE);
   Requirement QUOTE_REQUIREMENT = new Requirement(STANFORD_QUOTE);
+  Requirement UD_FEATURES_REQUIREMENT = new Requirement(STANFORD_UD_FEATURES);
 
   /**
    * A map from annotator name to a set of requirements for that annotator.
    * This is useful to have here for the purpose of static analysis on an
    * annotators list.
    */
   @SuppressWarnings("unchecked")
-  Map<String, Set<Requirement>> REQUIREMENTS = Collections.unmodifiableMap(new HashMap<String, Set<Requirement>>() {{
+  Map<String, Set<Requirement>> REQUIREMENTS = Collections.unmodifiableMap(new HashMap<String, Set<Requirement>>() {
-    put(STANFORD_TOKENIZE, Collections.EMPTY_SET);
+    {
-    put(STANFORD_CLEAN_XML, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_TOKENIZE, Collections.EMPTY_SET);
+      put(STANFORD_CLEAN_XML, Collections.unmodifiableSet(new HashSet<Requirement>() {{
         add(TOKENIZE_REQUIREMENT);  // A requirement for STANFORD_CLEAN_XML
-    }}));
+      }}));
-    put(STANFORD_SSPLIT, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_SSPLIT, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_POS, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_POS, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_LEMMA, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_LEMMA, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_NER, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_NER, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(LEMMA_REQUIREMENT);
+        add(LEMMA_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_GENDER, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_GENDER, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_TRUECASE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_TRUECASE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(LEMMA_REQUIREMENT);
+        add(LEMMA_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_PARSE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_PARSE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_DEPENDENCIES, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_DEPENDENCIES, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_DETERMINISTIC_COREF, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_DETERMINISTIC_COREF, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(LEMMA_REQUIREMENT);
+        add(LEMMA_REQUIREMENT);
-      add(NER_REQUIREMENT);
+        add(NER_REQUIREMENT);
-      add(PARSE_REQUIREMENT);
+        add(PARSE_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_COREF, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_COREF, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(LEMMA_REQUIREMENT);
+        add(LEMMA_REQUIREMENT);
-      add(NER_REQUIREMENT);
+        add(NER_REQUIREMENT);
-      add(PARSE_REQUIREMENT);
+        add(PARSE_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_RELATION, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_RELATION, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(LEMMA_REQUIREMENT);
+        add(LEMMA_REQUIREMENT);
-      add(NER_REQUIREMENT);
+        add(NER_REQUIREMENT);
-      add(DEPENDENCY_REQUIREMENT);
+        add(DEPENDENCY_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_NATLOG, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_NATLOG, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(LEMMA_REQUIREMENT);
+        add(LEMMA_REQUIREMENT);
-      add(DEPENDENCY_REQUIREMENT);  // TODO(gabor) can also use 'parse' annotator, technically
+        add(DEPENDENCY_REQUIREMENT);  // TODO(gabor) can also use 'parse' annotator, technically
-    }}));
+      }}));
-    put(STANFORD_OPENIE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_OPENIE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      add(TOKENIZE_REQUIREMENT);
+        add(TOKENIZE_REQUIREMENT);
-      add(SSPLIT_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
-      add(POS_REQUIREMENT);
+        add(POS_REQUIREMENT);
-      add(DEPENDENCY_REQUIREMENT);  // TODO(gabor) can also use 'parse' annotator, technically
+        add(DEPENDENCY_REQUIREMENT);  // TODO(gabor) can also use 'parse' annotator, technically
-      add(NATLOG_REQUIREMENT);
+        add(NATLOG_REQUIREMENT);
-    }}));
+      }}));
-    put(STANFORD_QUOTE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
+      put(STANFORD_QUOTE, Collections.unmodifiableSet(new HashSet<Requirement>() {{
-      // No requirements
+        // No requirements
-    }}));
+      }}));
+      put(STANFORD_UD_FEATURES, Collections.unmodifiableSet(new HashSet<Requirement>(){{
+        add(TOKENIZE_REQUIREMENT);
+        add(SSPLIT_REQUIREMENT);
+        add(POS_REQUIREMENT);
+        add(DEPENDENCY_REQUIREMENT);
+      }}));
   }});
 
   /**
@@ -252,5 +261,7 @@ public String toString() {
   Set<Requirement> TOKENIZE_SSPLIT_POS_DEPPARSE = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, POS_REQUIREMENT, DEPENDENCY_REQUIREMENT));
   Set<Requirement> PARSE_AND_TAG = Collections.unmodifiableSet(new ArraySet<>(POS_REQUIREMENT, PARSE_REQUIREMENT));
   Set<Requirement> PARSE_TAG_BINARIZED_TREES = Collections.unmodifiableSet(new ArraySet<>(POS_REQUIREMENT, PARSE_REQUIREMENT, BINARIZED_TREES_REQUIREMENT));
+  Set<Requirement> PARSE_TAG_DEPPARSE_BINARIZED_TREES = Collections.unmodifiableSet(new ArraySet<>(POS_REQUIREMENT, PARSE_REQUIREMENT, DEPENDENCY_REQUIREMENT, BINARIZED_TREES_REQUIREMENT));
+  Set<Requirement> PARSE_TAG_DEPPARSE = Collections.unmodifiableSet(new ArraySet<>(POS_REQUIREMENT, PARSE_REQUIREMENT, DEPENDENCY_REQUIREMENT));
 
 }
diff --git a/src/edu/stanford/nlp/pipeline/AnnotatorFactories.java b/src/edu/stanford/nlp/pipeline/AnnotatorFactories.java
@@ -651,4 +651,24 @@ protected String additionalSignature() {
     };
   }
 
+
+  //
+  // UD Features Extractor: factory whose create() obtains the annotator from annotatorImpl.udfeats(properties)
+  //
+  public static AnnotatorFactory udfeats(Properties properties, final AnnotatorImplementations annotatorImpl) {
+    return new AnnotatorFactory(properties, annotatorImpl) {
+      private static final long serialVersionUID = -2525567112379296672L;
+
+      @Override
+      public Annotator create() { // construction is delegated to the supplied AnnotatorImplementations
+                return annotatorImpl.udfeats(properties);
+            }
+
+      @Override
+      protected String additionalSignature() { // this factory contributes an empty additional signature
+                return "";
+            }
+    };
+  }
+
 }
diff --git a/src/edu/stanford/nlp/pipeline/AnnotatorImplementations.java b/src/edu/stanford/nlp/pipeline/AnnotatorImplementations.java
@@ -263,4 +263,11 @@ public Annotator quote(Properties properties) {
     return new QuoteAnnotator(relevantProperties);
   }
 
+  /**
+   * Creates the annotator that adds universal dependencies features; {@code properties} is accepted but not read here.
+   */
+  public Annotator udfeats(Properties properties) {
+    return new UDFeatureAnnotator();
+  }
+
 }
diff --git a/src/edu/stanford/nlp/pipeline/CharniakParserAnnotator.java b/src/edu/stanford/nlp/pipeline/CharniakParserAnnotator.java
@@ -69,7 +69,7 @@ public void annotate(Annotation annotation) {
 
         List<Tree> trees = Generics.newArrayList(1);
         trees.add(tree);
-        ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, GrammaticalStructure.Extras.NONE, null);
+        ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, GrammaticalStructure.Extras.NONE);
       }
     } else {
       throw new RuntimeException("unable to find sentences in: " + annotation);

diff --git a/src/edu/stanford/nlp/pipeline/JSONOutputter.java b/src/edu/stanford/nlp/pipeline/JSONOutputter.java
@@ -128,7 +128,6 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
               // Add a single token
               l3.set("index", token.index());
               l3.set("word", token.word());
-              l3.set("originalText", token.originalText());
               l3.set("lemma", token.lemma());
               l3.set("characterOffsetBegin", token.beginPosition());
               l3.set("characterOffsetEnd", token.endPosition());

diff --git a/src/edu/stanford/nlp/pipeline/ParserAnnotator.java b/src/edu/stanford/nlp/pipeline/ParserAnnotator.java
@@ -63,8 +63,6 @@ public class ParserAnnotator extends SentenceAnnotator {
 
   private final boolean keepPunct;
 
-  private UniversalDependenciesFeatureAnnotator featureAnnotator = null;
-
   /** If true, don't re-annotate sentences that already have a tree annotation */
   private final boolean noSquash;
   private final GrammaticalStructure.Extras extraDependencies;
@@ -96,13 +94,6 @@ public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent, Funct
     if (this.BUILD_GRAPHS) {
       TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
       this.gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(), parser.getTLPParams().typedDependencyHeadFinder());
-      if (this.gsf instanceof UniversalEnglishGrammaticalStructureFactory) {
-        try {
-          this.featureAnnotator = new UniversalDependenciesFeatureAnnotator();
-        } catch (IOException e) {
-          //do nothing
-        }
-      }
     } else {
       this.gsf = null;
     }
@@ -155,13 +146,6 @@ public ParserAnnotator(String annotatorName, Properties props) {
       TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
       Predicate<String> punctFilter = this.keepPunct ? Filters.acceptFilter() : tlp.punctuationWordRejectFilter();
       this.gsf = tlp.grammaticalStructureFactory(punctFilter, parser.getTLPParams().typedDependencyHeadFinder());
-      if (this.gsf instanceof UniversalEnglishGrammaticalStructureFactory) {
-        try {
-          this.featureAnnotator = new UniversalDependenciesFeatureAnnotator();
-        } catch (IOException e) {
-          //do nothing
-        }
-      }
     } else {
       this.gsf = null;
     }
@@ -302,7 +286,7 @@ private void finishSentence(CoreMap sentence, List<Tree> trees) {
       trees = mappedTrees;
     }
 
-    ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, extraDependencies, featureAnnotator);
+    ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, extraDependencies);
 
     if (saveBinaryTrees) {
       TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
@@ -352,10 +336,18 @@ public Set<Requirement> requires() {
 
   @Override
   public Set<Requirement> requirementsSatisfied() {
-    if (this.saveBinaryTrees) {
+    if (this.BUILD_GRAPHS) { // BUILD_GRAPHS set: report the requirement sets that also contain DEPENDENCY_REQUIREMENT
-      return PARSE_TAG_BINARIZED_TREES;
+      if (this.saveBinaryTrees) { // binary trees are saved too, so the set also contains BINARIZED_TREES_REQUIREMENT
+        return PARSE_TAG_DEPPARSE_BINARIZED_TREES;
+      } else {
+        return PARSE_TAG_DEPPARSE;
+      }
     } else {
-      return PARSE_AND_TAG;
+      if (this.saveBinaryTrees) { // BUILD_GRAPHS not set: same choice as before this change, without the dependency requirement
+        return PARSE_TAG_BINARIZED_TREES;
+      } else {
+        return PARSE_AND_TAG;
+      }
     }
   }
 }
diff --git a/src/edu/stanford/nlp/pipeline/ParserAnnotatorUtils.java b/src/edu/stanford/nlp/pipeline/ParserAnnotatorUtils.java
@@ -31,8 +31,7 @@ private ParserAnnotatorUtils() {} // static methods
    */
   public static void fillInParseAnnotations(boolean verbose, boolean buildGraphs,
                                             GrammaticalStructureFactory gsf, CoreMap sentence,
-                                            List<Tree> trees, GrammaticalStructure.Extras extras,
+                                            List<Tree> trees, GrammaticalStructure.Extras extras) {
-                                            UniversalDependenciesFeatureAnnotator featureAnnotator) {
 
     boolean first = true;
     for (Tree tree : trees) {
@@ -63,11 +62,6 @@ public static void fillInParseAnnotations(boolean verbose, boolean buildGraphs,
           SemanticGraph uncollapsedDeps = SemanticGraphFactory.generateUncollapsedDependencies(gsf.newGrammaticalStructure(tree), extras);
           SemanticGraph ccDeps = SemanticGraphFactory.generateCCProcessedDependencies(gsf.newGrammaticalStructure(tree), extras);
 
-          // add features to graphs if we are converting to English UD
-          if (featureAnnotator != null) {
-            featureAnnotator.addFeatures(deps, tree, false, true);
-          }
-
           if (verbose) {
             System.err.println("SDs:");
             System.err.println(deps.toString(SemanticGraph.OutputFormat.LIST));

diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -415,11 +415,13 @@ protected synchronized AnnotatorPool getDefaultAnnotatorPool(final Properties in
     pool.register(STANFORD_COREF, AnnotatorFactories.coref(properties, annotatorImplementation));
     pool.register(STANFORD_RELATION, AnnotatorFactories.relation(properties, annotatorImplementation));
     pool.register(STANFORD_SENTIMENT, AnnotatorFactories.sentiment(properties, annotatorImplementation));
-    pool.register(STANFORD_COLUMN_DATA_CLASSIFIER,AnnotatorFactories.columnDataClassifier(properties,annotatorImplementation));
+    pool.register(STANFORD_COLUMN_DATA_CLASSIFIER,AnnotatorFactories.columnDataClassifier(properties, annotatorImplementation));
     pool.register(STANFORD_DEPENDENCIES, AnnotatorFactories.dependencies(properties, annotatorImplementation));
     pool.register(STANFORD_NATLOG, AnnotatorFactories.natlog(properties, annotatorImplementation));
     pool.register(STANFORD_OPENIE, AnnotatorFactories.openie(properties, annotatorImplementation));
     pool.register(STANFORD_QUOTE, AnnotatorFactories.quote(properties, annotatorImplementation));
+    pool.register(STANFORD_UD_FEATURES, AnnotatorFactories.udfeats(properties, annotatorImplementation));
+
     // Add more annotators here
 
     // add annotators loaded via reflection from classnames specified

diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
@@ -98,14 +98,14 @@ private static Map<String, String> getURLParams(URI uri) throws UnsupportedEncod
 
       String query = uri.getQuery();
       String[] queryFields = query
-          .replaceAll("\\\\&", "___AMP___")
+          .replace("\\&", "___AMP___")
-          .replaceAll("\\\\+", "___PLUS___")
+          .replace("\\+", "___PLUS___")
           .split("&");
       for (String queryField : queryFields) {
         int firstEq = queryField.indexOf('=');
         // Convention uses "+" for spaces.
-        String key = URLDecoder.decode(queryField.substring(0, firstEq), "utf8").replaceAll("___AMP___", "&").replaceAll("___PLUS___", "+");
+        String key = URLDecoder.decode(queryField.substring(0, firstEq), "utf8").replace("___AMP___", "&").replace("___PLUS___", "+");
-        String value = URLDecoder.decode(queryField.substring(firstEq + 1), "utf8").replaceAll("___AMP___", "&").replaceAll("___PLUS___", "+");
+        String value = URLDecoder.decode(queryField.substring(firstEq + 1), "utf8").replace("___AMP___", "&").replace("___PLUS___", "+");
         urlParams.put(key, value);
       }
       return urlParams;