
Commit

Improve CoNLL-U document reader, allow additional dependencies and comments, add test.
sebschu authored and Stanford NLP committed Sep 1, 2015
1 parent 3d60367 commit cf31ad0
Showing 3 changed files with 253 additions and 36 deletions.
15 changes: 12 additions & 3 deletions src/edu/stanford/nlp/ling/CoreAnnotations.java
@@ -447,13 +447,22 @@ public Class<String> getType() {
return String.class;
}
}

/**
* CoNLL-U dep parsing - span of multiword tokens
*/
public static class CoNLLUTokenSpanAnnotation implements CoreAnnotation<Pair<Integer,Integer>> {
public Class<Pair<Integer,Integer>> getType() {
return ErasureUtils.<Class<Pair<Integer,Integer>>> uncheckedCast(Pair.class);
}
}

/**
* CoNLL-U dep parsing - secondary dependencies, as a map from governor index to relation name
*/
public static class CoNLLUSecondaryDepsAnnotation implements CoreAnnotation<String> {
public Class<String> getType() {
return String.class;
public static class CoNLLUSecondaryDepsAnnotation implements CoreAnnotation<HashMap<Integer,String>> {
public Class<HashMap<Integer,String>> getType() {
return ErasureUtils.<Class<HashMap<Integer,String>>> uncheckedCast(HashMap.class);
}
}
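
For reference, a minimal sketch of how these two annotations are used on an IndexedWord (the values here are made up for illustration; see the reader changes and tests below):

IndexedWord word = new IndexedWord();
// Span of a multiword token such as "2-3  haven't": first and last covered word index.
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new Pair<>(2, 3));
// Secondary dependencies: governor index -> relation name, e.g. parsed from "4:nsubj".
HashMap<Integer,String> extraDeps = new HashMap<>();
extraDeps.put(4, "nsubj");
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);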

184 changes: 151 additions & 33 deletions src/edu/stanford/nlp/trees/CoNLLUDocumentReader.java
@@ -11,6 +11,7 @@
import java.util.function.Function;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
@@ -19,14 +20,13 @@
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.Pair;

/**
* Reader for CoNLL-U formatted dependency treebanks.
*
* @author Sebastian Schuster
*/


public class CoNLLUDocumentReader implements
IteratorFromReaderFactory<SemanticGraph> {

@@ -43,37 +43,82 @@ public Iterator<SemanticGraph> getIterator(Reader r) {
return ifrf.getIterator(r);
}


private static final Comparator<IndexedWord> byIndex = (i1, i2) -> i1.compareTo(i2);

/* Comparator for putting multiword tokens before regular tokens. */
private static final Comparator<IndexedWord> byType = (i1, i2) ->
i1.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? -1 :
i2.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? 1 : 0;
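// E.g. for the multiword token line "2-3  haven't", the span entry is indexed 2
// (see WordProcessor below), so byIndex ties with word 2 ("have") and byType puts
// the span entry first; apply() can then copy the original surface form onto
// words 2 and 3 via setOriginalText().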

private static class SentenceProcessor implements Function<String,SemanticGraph> {

private int lineNumberCounter = 0;

public SemanticGraph apply(String line) {
if (line == null) return null;

Function<String,IndexedWord> func = new WordProcessor();
ObjectBank<IndexedWord> words = ObjectBank.getLineIterator(new StringReader(line), func);
List<IndexedWord> sorted = new ArrayList<IndexedWord>(words);
Collections.sort(sorted);

List<IndexedWord> wordList = new ArrayList<>(words);

List<IndexedWord> sorted = new ArrayList<>(wordList.size());
wordList.stream().filter(w -> w != IndexedWord.NO_WORD)
.sorted(byIndex.thenComparing(byType))
.forEach(w -> sorted.add(w));

List<IndexedWord> sortedTokens = new ArrayList<>(wordList.size());
sorted.stream()
.filter(w -> !w.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class))
.forEach(w -> sortedTokens.add(w));

/* Construct a semantic graph. */
List<TypedDependency> deps = new ArrayList<TypedDependency>(sorted.size());
List<TypedDependency> deps = new ArrayList<>(sorted.size());

Pair<Integer,Integer> tokenSpan = null;
String originalToken = null;
for (IndexedWord word : sorted) {
lineNumberCounter++;
GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish, word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
IndexedWord gov;
if (govIdx == 0) {
gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
gov.setValue("ROOT");
if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
reln = GrammaticalRelation.ROOT;
}

if (word.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
tokenSpan = word.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
originalToken = word.word();
} else {
gov = sorted.get(govIdx - 1);
/* Deal with multiword tokens. */
if (tokenSpan != null && tokenSpan.second >= word.index()) {
word.setOriginalText(originalToken);
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, tokenSpan);
} else {
tokenSpan = null;
originalToken = null;
}
GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish,
word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
IndexedWord gov;
if (govIdx == 0) {
gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
gov.setValue("ROOT");
if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
reln = GrammaticalRelation.ROOT;
}
} else {
gov = sortedTokens.get(govIdx - 1);
}
TypedDependency dep = new TypedDependency(reln, gov, word);
word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
deps.add(dep);

HashMap<Integer,String> extraDeps = word.get(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class);
for (Integer extraGovIdx : extraDeps.keySet()) {
GrammaticalRelation extraReln = GrammaticalRelation.valueOf(Language.UniversalEnglish, extraDeps.get(extraGovIdx));
IndexedWord extraGov = sortedTokens.get(extraGovIdx - 1);
TypedDependency extraDep = new TypedDependency(extraReln, extraGov, word);
extraDep.setExtra();
deps.add(extraDep);
}
}
TypedDependency dep = new TypedDependency(reln, gov, word);
word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
deps.add(dep);
}
lineNumberCounter++;

@@ -83,29 +128,49 @@ public SemanticGraph apply(String line) {

private static class WordProcessor implements Function<String,IndexedWord> {
public IndexedWord apply(String line) {

/* Comments.
* TODO[sebschu]: Save them somewhere such that they can be output again.
*/
if (line.startsWith("#")) {
return IndexedWord.NO_WORD;
}

String[] bits = line.split("\\s+");

IndexedWord word = new IndexedWord();
word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
word.set(CoreAnnotations.TextAnnotation.class, bits[1]);
word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);

word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, bits[8]);
word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);
/* Check if it is a multiword token, e.g. "2-3" for "haven't". */
if (bits[0].contains("-")) {
String[] span = bits[0].split("-");
Integer start = Integer.parseInt(span[0]);
Integer end = Integer.parseInt(span[1]);
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new Pair<>(start, end));
word.set(CoreAnnotations.IndexAnnotation.class, start);
} else {
word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);

word.setIndex(Integer.parseInt(bits[0]));
word.setValue(bits[1]);
word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);

/* Parse features. */
HashMap<String, String> features = parseFeatures(bits[5]);
word.setIndex(Integer.parseInt(bits[0]));
word.setValue(bits[1]);

word.set(CoreAnnotations.CoNLLUFeats.class, features);
/* Parse features. */
HashMap<String, String> features = parseFeatures(bits[5]);
word.set(CoreAnnotations.CoNLLUFeats.class, features);

/* Parse extra dependencies. */
HashMap<Integer,String> extraDeps = parseExtraDeps(bits[8]);
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);
}

return word;
return word;
}
}

@@ -136,7 +201,6 @@ public static HashMap<String,String> parseFeatures(String featureString) {
*
* @return The feature string.
*/

public static String toFeatureString(HashMap<String,String> features) {
StringBuffer sb = new StringBuffer();
boolean first = true;
@@ -163,6 +227,60 @@ public static String toFeatureString(HashMap<String,String> features) {
return sb.toString();
}
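
// Usage sketch (editor's illustration; feature strings follow the CoNLL-U FEATS
// format seen in the tests below, e.g. "Case=Nom|Number=Sing|Person=1"):
//   HashMap<String,String> feats = parseFeatures("Case=Nom|Number=Sing");
//   toFeatureString(feats);  // -> "Case=Nom|Number=Sing"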

/**
* Parses the value of the extra dependencies column in a CoNLL-U file
* and returns them in a HashMap with the governor indices as keys
* and the relation names as values.
*
* @param extraDepsString The extra dependencies string from the CoNLL-U file.
* @return A HashMap<Integer,String> with the additional dependencies.
*/
public static HashMap<Integer,String> parseExtraDeps(String extraDepsString) {
HashMap<Integer,String> extraDeps = new HashMap<>();
if ( ! extraDepsString.equals("_")) {
String[] extraDepParts = extraDepsString.split("\\|");
for (String extraDepString : extraDepParts) {
int sepPos = extraDepString.lastIndexOf(":");
String reln = extraDepString.substring(sepPos + 1);
Integer gov = Integer.parseInt(extraDepString.substring(0, sepPos));
extraDeps.put(gov, reln);
}
}
return extraDeps;
}
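
// Editor's illustration, using DEPS values from the test input below:
//   parseExtraDeps("2:dobj|4:dobj")  ->  {2=dobj, 4=dobj}
//   parseExtraDeps("_")              ->  {}  (no extra dependencies)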

/**
* Converts an extra dependencies hash map to a string to be used
* in a CoNLL-U file.
*
* @param extraDeps The extra dependencies map.
* @return The extra dependencies string.
*/
public static String toExtraDepsString(HashMap<Integer,String> extraDeps) {
StringBuffer sb = new StringBuffer();
boolean first = true;
List<Integer> sortedKeys = new ArrayList<>(extraDeps.keySet());
Collections.sort(sortedKeys);
for (Integer key : sortedKeys) {
if ( ! first) {
sb.append("|");
} else {
first = false;
}

sb.append(key)
.append(":")
.append(extraDeps.get(key));
}

/* Empty dependency list. */
if (first) {
sb.append("_");
}
return sb.toString();
}
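
// Editor's illustration: the inverse of parseExtraDeps, with governor indices
// emitted in ascending order and "_" for an empty map:
//   toExtraDepsString(parseExtraDeps("4:dobj|2:dobj"))  ->  "2:dobj|4:dobj"
//   toExtraDepsString(new HashMap<>())                  ->  "_"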


public static class FeatureNameComparator implements Comparator<String> {

@Override
90 changes: 90 additions & 0 deletions test/src/edu/stanford/nlp/trees/CoNLLUDocumentReaderTest.java
@@ -0,0 +1,90 @@
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import junit.framework.TestCase;

import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;

/**
* @author Sebastian Schuster
*/
public class CoNLLUDocumentReaderTest extends TestCase {

private static String MULTIWORD_TEST_INPUT =
"1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _\n" +
"2-3 haven't _ _ _ _ _ _ _ _\n" +
"2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _\n" +
"3 not not PART RB Negative=Neg 2 neg _ _\n" +
"4 a a DET DT Definite=Ind|PronType=Art 5 det _ _\n" +
"5 clue clue NOUN NN Number=Sing 2 dobj _ _\n" +
"6 . . PUNCT . _ 2 punct _ _\n\n";

private static String COMMENT_TEST_INPUT =
"#comment line 1\n" +
"#comment line 2\n" +
"1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _\n" +
"2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _\n" +
"3 not not PART RB Negative=Neg 2 neg _ _\n" +
"4 a a DET DT Definite=Ind|PronType=Art 5 det _ _\n" +
"5 clue clue NOUN NN Number=Sing 2 dobj _ _\n" +
"6 . . PUNCT . _ 2 punct _ _\n\n";

private static String EXTRA_DEPS_TEST_INPUT =
"1 They They PRON PRP _ 2 nsubj 4:nsubj _\n" +
"2 buy buy VERB VBP _ 0 root _ _\n" +
"3 and and CONJ CC _ 2 cc _ _\n" +
"4 sell sell VERB VBP _ 5 conj _ _\n" +
"5 books book NOUN NNS _ 2 dobj 4:dobj _\n" +
"6 , , PUNCT , _ 5 punct _ _\n" +
"7 newspapers newspaper NOUN NNS _ 5 conj 2:dobj|4:dobj _\n" +
"8 and and CONJ CC _ 5 cc _ _\n" +
"9 magazines magazine NOUN NNS _ 5 conj 2:dobj|4:dobj _\n" +
"10 . . PUNCT . _ 2 punct _ _\n\n";


public void testMultiWords() {
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader stringReader = new StringReader(MULTIWORD_TEST_INPUT);
Iterator<SemanticGraph> it = reader.getIterator(stringReader);

SemanticGraph sg = it.next();
assertNotNull(sg);
assertFalse("The input only contains one dependency tree.", it.hasNext());
assertEquals("[have/VBP nsubj>I/PRP neg>not/RB dobj>[clue/NN det>a/DT] punct>./.]", sg.toCompactString(true));

for (IndexedWord iw : sg.vertexListSorted()) {
if (iw.index() != 2 && iw.index() != 3) {
assertEquals("", iw.originalText());
} else {
assertEquals("haven't", iw.originalText());
}
}
}

public void testComment() {
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader stringReader = new StringReader(COMMENT_TEST_INPUT);
Iterator<SemanticGraph> it = reader.getIterator(stringReader);

SemanticGraph sg = it.next();
assertNotNull(sg);
assertFalse("The input only contains one dependency tree.", it.hasNext());
assertEquals("[have/VBP nsubj>I/PRP neg>not/RB dobj>[clue/NN det>a/DT] punct>./.]", sg.toCompactString(true));
}

public void testExtraDependencies() {
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader stringReader = new StringReader(EXTRA_DEPS_TEST_INPUT);
Iterator<SemanticGraph> it = reader.getIterator(stringReader);

SemanticGraph sg = it.next();
assertNotNull(sg);
assertFalse("The input only contains one dependency tree.", it.hasNext());
assertTrue(sg.containsEdge(sg.getNodeByIndex(4), sg.getNodeByIndex(1)));
assertTrue(sg.containsEdge(sg.getNodeByIndex(2), sg.getNodeByIndex(7)));
assertTrue(sg.containsEdge(sg.getNodeByIndex(4), sg.getNodeByIndex(7)));
}
}
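
For reference, a minimal sketch of driving the reader over a CoNLL-U file (the file name is a hypothetical example; reading from a String works the same way, as in the tests above):

CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader r = new BufferedReader(new FileReader("treebank.conllu"));  // hypothetical file
Iterator<SemanticGraph> it = reader.getIterator(r);
while (it.hasNext()) {
  SemanticGraph sg = it.next();                  // one graph per sentence
  System.out.println(sg.toCompactString(true));  // format asserted in the tests above
}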
