
Commit

Improve CoNLL-U document reader, allow additional dependencies and comments, add test.
sebschu authored and Stanford NLP committed Sep 1, 2015
1 parent 3d60367 commit cf31ad0
Showing 3 changed files with 253 additions and 36 deletions.
15 changes: 12 additions & 3 deletions src/edu/stanford/nlp/ling/CoreAnnotations.java
@@ -447,13 +447,22 @@ public Class<String> getType() {
return String.class;
}
}

/**
* CoNLL-U dep parsing - span of multiword tokens
*/
public static class CoNLLUTokenSpanAnnotation implements CoreAnnotation<Pair<Integer,Integer>> {
public Class<Pair<Integer,Integer>> getType() {
return ErasureUtils.<Class<Pair<Integer,Integer>>> uncheckedCast(Pair.class);
}
}

/**
* CoNLL-U dep parsing - secondary dependencies, as a map from governor index to relation name
*/
public static class CoNLLUSecondaryDepsAnnotation implements CoreAnnotation<String> {
public Class<String> getType() {
return String.class;
public static class CoNLLUSecondaryDepsAnnotation implements CoreAnnotation<HashMap<Integer,String>> {
public Class<HashMap<Integer,String>> getType() {
return ErasureUtils.<Class<HashMap<Integer,String>>> uncheckedCast(HashMap.class);
}
}
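
For reference, a minimal sketch of how these two annotations are used on an IndexedWord (the values here are made up for illustration; see the reader changes and tests below):

IndexedWord word = new IndexedWord();
// Span of a multiword token such as "2-3  haven't": first and last covered word index.
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new Pair<>(2, 3));
// Secondary dependencies: governor index -> relation name, e.g. parsed from "4:nsubj".
HashMap<Integer,String> extraDeps = new HashMap<>();
extraDeps.put(4, "nsubj");
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);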

184 changes: 151 additions & 33 deletions src/edu/stanford/nlp/trees/CoNLLUDocumentReader.java
@@ -11,6 +11,7 @@
import java.util.function.Function;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
@@ -19,14 +20,13 @@
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.Pair;

/**
* Reader for CoNLL-U formatted dependency treebanks.
*
* @author Sebastian Schuster
*/


public class CoNLLUDocumentReader implements
IteratorFromReaderFactory<SemanticGraph> {

@@ -43,37 +43,82 @@ public Iterator<SemanticGraph> getIterator(Reader r) {
return ifrf.getIterator(r);
}


private static final Comparator<IndexedWord> byIndex = (i1, i2) -> i1.compareTo(i2);

/* Comparator for putting multiword tokens before regular tokens. */
private static final Comparator<IndexedWord> byType = (i1, i2) ->
i1.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? -1 :
i2.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class) ? 1 : 0;
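// E.g. for the multiword token line "2-3  haven't", the span entry is indexed 2
// (see WordProcessor below), so byIndex ties with word 2 ("have") and byType puts
// the span entry first; apply() can then copy the original surface form onto
// words 2 and 3 via setOriginalText().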

private static class SentenceProcessor implements Function<String,SemanticGraph> {

private int lineNumberCounter = 0;

public SemanticGraph apply(String line) {
if (line == null) return null;

Function<String,IndexedWord> func = new WordProcessor();
ObjectBank<IndexedWord> words = ObjectBank.getLineIterator(new StringReader(line), func);
List<IndexedWord> sorted = new ArrayList<IndexedWord>(words);
Collections.sort(sorted);

List<IndexedWord> wordList = new ArrayList<>(words);

List<IndexedWord> sorted = new ArrayList<>(wordList.size());
wordList.stream().filter(w -> w != IndexedWord.NO_WORD)
.sorted(byIndex.thenComparing(byType))
.forEach(w -> sorted.add(w));

List<IndexedWord> sortedTokens = new ArrayList<>(wordList.size());
sorted.stream()
.filter(w -> !w.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class))
.forEach(w -> sortedTokens.add(w));

/* Construct a semantic graph. */
List<TypedDependency> deps = new ArrayList<TypedDependency>(sorted.size());
List<TypedDependency> deps = new ArrayList<>(sorted.size());

Pair<Integer,Integer> tokenSpan = null;
String originalToken = null;
for (IndexedWord word : sorted) {
lineNumberCounter++;
GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish, word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
IndexedWord gov;
if (govIdx == 0) {
gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
gov.setValue("ROOT");
if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
reln = GrammaticalRelation.ROOT;
}

if (word.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
tokenSpan = word.get(CoreAnnotations.CoNLLUTokenSpanAnnotation.class);
originalToken = word.word();
} else {
gov = sorted.get(govIdx - 1);
/* Deal with multiword tokens. */
if (tokenSpan != null && tokenSpan.second >= word.index()) {
word.setOriginalText(originalToken);
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, tokenSpan);
} else {
tokenSpan = null;
originalToken = null;
}
GrammaticalRelation reln = GrammaticalRelation.valueOf(Language.UniversalEnglish,
word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class));
int govIdx = word.get(CoreAnnotations.CoNLLDepParentIndexAnnotation.class);
IndexedWord gov;
if (govIdx == 0) {
gov = new IndexedWord(word.docID(), word.sentIndex(), 0);
gov.setValue("ROOT");
if (word.get(CoreAnnotations.CoNLLDepTypeAnnotation.class).equals("root")) {
reln = GrammaticalRelation.ROOT;
}
} else {
gov = sortedTokens.get(govIdx - 1);
}
TypedDependency dep = new TypedDependency(reln, gov, word);
word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
deps.add(dep);

HashMap<Integer,String> extraDeps = word.get(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class);
for (Integer extraGovIdx : extraDeps.keySet()) {
GrammaticalRelation extraReln = GrammaticalRelation.valueOf(Language.UniversalEnglish, extraDeps.get(extraGovIdx));
IndexedWord extraGov = sortedTokens.get(extraGovIdx - 1);
TypedDependency extraDep = new TypedDependency(extraReln, extraGov, word);
extraDep.setExtra();
deps.add(extraDep);
}
}
TypedDependency dep = new TypedDependency(reln, gov, word);
word.set(CoreAnnotations.LineNumberAnnotation.class, lineNumberCounter);
deps.add(dep);
}
lineNumberCounter++;

@@ -83,29 +128,49 @@ public SemanticGraph apply(String line) {

private static class WordProcessor implements Function<String,IndexedWord> {
public IndexedWord apply(String line) {

/* Comments.
* TODO[sebschu]: Save them somewhere such that they can be output again.
*/
if (line.startsWith("#")) {
return IndexedWord.NO_WORD;
}

String[] bits = line.split("\\s+");

IndexedWord word = new IndexedWord();
word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
word.set(CoreAnnotations.TextAnnotation.class, bits[1]);
word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);

word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, bits[8]);
word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);
/* Check if it is a multiword token, e.g. "2-3" for "haven't". */
if (bits[0].contains("-")) {
String[] span = bits[0].split("-");
Integer start = Integer.parseInt(span[0]);
Integer end = Integer.parseInt(span[1]);
word.set(CoreAnnotations.CoNLLUTokenSpanAnnotation.class, new Pair<>(start, end));
word.set(CoreAnnotations.IndexAnnotation.class, start);
} else {
word.set(CoreAnnotations.IndexAnnotation.class, Integer.parseInt(bits[0]));
word.set(CoreAnnotations.LemmaAnnotation.class, bits[2]);
word.set(CoreAnnotations.CoarseTagAnnotation.class, bits[3]);
word.set(CoreAnnotations.PartOfSpeechAnnotation.class, bits[4]);

word.setIndex(Integer.parseInt(bits[0]));
word.setValue(bits[1]);
word.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, Integer.parseInt(bits[6]));
word.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, bits[7]);
word.set(CoreAnnotations.CoNLLUMisc.class, bits[9]);

/* Parse features. */
HashMap<String, String> features = parseFeatures(bits[5]);
word.setIndex(Integer.parseInt(bits[0]));
word.setValue(bits[1]);

word.set(CoreAnnotations.CoNLLUFeats.class, features);
/* Parse features. */
HashMap<String, String> features = parseFeatures(bits[5]);
word.set(CoreAnnotations.CoNLLUFeats.class, features);

/* Parse extra dependencies. */
HashMap<Integer,String> extraDeps = parseExtraDeps(bits[8]);
word.set(CoreAnnotations.CoNLLUSecondaryDepsAnnotation.class, extraDeps);
}

return word;
return word;
}
}

@@ -136,7 +201,6 @@ public static HashMap<String,String> parseFeatures(String featureString) {
*
* @return The feature string.
*/

public static String toFeatureString(HashMap<String,String> features) {
StringBuffer sb = new StringBuffer();
boolean first = true;
@@ -163,6 +227,60 @@ public static String toFeatureString(HashMap<String,String> features) {
return sb.toString();
}
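
// Usage sketch (editor's illustration; feature strings follow the CoNLL-U FEATS
// format seen in the tests below, e.g. "Case=Nom|Number=Sing|Person=1"):
//   HashMap<String,String> feats = parseFeatures("Case=Nom|Number=Sing");
//   toFeatureString(feats);  // -> "Case=Nom|Number=Sing"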

/**
* Parses the value of the extra dependencies column in a CoNLL-U file
* and returns them in a HashMap with the governor indices as keys
* and the relation names as values.
*
* @param extraDepsString The extra dependencies string from the CoNLL-U file.
* @return A HashMap<Integer,String> with the additional dependencies.
*/
public static HashMap<Integer,String> parseExtraDeps(String extraDepsString) {
HashMap<Integer,String> extraDeps = new HashMap<>();
if ( ! extraDepsString.equals("_")) {
String[] extraDepParts = extraDepsString.split("\\|");
for (String extraDepString : extraDepParts) {
int sepPos = extraDepString.lastIndexOf(":");
String reln = extraDepString.substring(sepPos + 1);
Integer gov = Integer.parseInt(extraDepString.substring(0, sepPos));
extraDeps.put(gov, reln);
}
}
return extraDeps;
}
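
// Editor's illustration, using DEPS values from the test input below:
//   parseExtraDeps("2:dobj|4:dobj")  ->  {2=dobj, 4=dobj}
//   parseExtraDeps("_")              ->  {}  (no extra dependencies)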

/**
* Converts an extra dependencies hash map to a string to be used
* in a CoNLL-U file.
*
* @param extraDeps The extra dependencies map.
* @return The extra dependencies string.
*/
public static String toExtraDepsString(HashMap<Integer,String> extraDeps) {
StringBuffer sb = new StringBuffer();
boolean first = true;
List<Integer> sortedKeys = new ArrayList<>(extraDeps.keySet());
Collections.sort(sortedKeys);
for (Integer key : sortedKeys) {
if ( ! first) {
sb.append("|");
} else {
first = false;
}

sb.append(key)
.append(":")
.append(extraDeps.get(key));
}

/* Empty dependency list. */
if (first) {
sb.append("_");
}
return sb.toString();
}
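
// Editor's illustration: the inverse of parseExtraDeps, with governor indices
// emitted in ascending order and "_" for an empty map:
//   toExtraDepsString(parseExtraDeps("4:dobj|2:dobj"))  ->  "2:dobj|4:dobj"
//   toExtraDepsString(new HashMap<>())                  ->  "_"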


public static class FeatureNameComparator implements Comparator<String> {

@Override
90 changes: 90 additions & 0 deletions test/src/edu/stanford/nlp/trees/CoNLLUDocumentReaderTest.java
@@ -0,0 +1,90 @@
package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import junit.framework.TestCase;

import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;

/**
* @author Sebastian Schuster
*/
public class CoNLLUDocumentReaderTest extends TestCase {

private static String MULTIWORD_TEST_INPUT =
"1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _\n" +
"2-3 haven't _ _ _ _ _ _ _ _\n" +
"2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _\n" +
"3 not not PART RB Negative=Neg 2 neg _ _\n" +
"4 a a DET DT Definite=Ind|PronType=Art 5 det _ _\n" +
"5 clue clue NOUN NN Number=Sing 2 dobj _ _\n" +
"6 . . PUNCT . _ 2 punct _ _\n\n";

private static String COMMENT_TEST_INPUT =
"#comment line 1\n" +
"#comment line 2\n" +
"1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _\n" +
"2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _\n" +
"3 not not PART RB Negative=Neg 2 neg _ _\n" +
"4 a a DET DT Definite=Ind|PronType=Art 5 det _ _\n" +
"5 clue clue NOUN NN Number=Sing 2 dobj _ _\n" +
"6 . . PUNCT . _ 2 punct _ _\n\n";

private static String EXTRA_DEPS_TEST_INPUT =
"1 They They PRON PRP _ 2 nsubj 4:nsubj _\n" +
"2 buy buy VERB VBP _ 0 root _ _\n" +
"3 and and CONJ CC _ 2 cc _ _\n" +
"4 sell sell VERB VBP _ 5 conj _ _\n" +
"5 books book NOUN NNS _ 2 dobj 4:dobj _\n" +
"6 , , PUNCT , _ 5 punct _ _\n" +
"7 newspapers newspaper NOUN NNS _ 5 conj 2:dobj|4:dobj _\n" +
"8 and and CONJ CC _ 5 cc _ _\n" +
"9 magazines magazine NOUN NNS _ 5 conj 2:dobj|4:dobj _\n" +
"10 . . PUNCT . _ 2 punct _ _\n\n";


public void testMultiWords() {
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader stringReader = new StringReader(MULTIWORD_TEST_INPUT);
Iterator<SemanticGraph> it = reader.getIterator(stringReader);

SemanticGraph sg = it.next();
assertNotNull(sg);
assertFalse("The input only contains one dependency tree.", it.hasNext());
assertEquals("[have/VBP nsubj>I/PRP neg>not/RB dobj>[clue/NN det>a/DT] punct>./.]", sg.toCompactString(true));

for (IndexedWord iw : sg.vertexListSorted()) {
if (iw.index() != 2 && iw.index() != 3) {
assertEquals("", iw.originalText());
} else {
assertEquals("haven't", iw.originalText());
}
}
}

public void testComment() {
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader stringReader = new StringReader(COMMENT_TEST_INPUT);
Iterator<SemanticGraph> it = reader.getIterator(stringReader);

SemanticGraph sg = it.next();
assertNotNull(sg);
assertFalse("The input only contains one dependency tree.", it.hasNext());
assertEquals("[have/VBP nsubj>I/PRP neg>not/RB dobj>[clue/NN det>a/DT] punct>./.]", sg.toCompactString(true));
}

public void testExtraDependencies() {
CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader stringReader = new StringReader(EXTRA_DEPS_TEST_INPUT);
Iterator<SemanticGraph> it = reader.getIterator(stringReader);

SemanticGraph sg = it.next();
assertNotNull(sg);
assertFalse("The input only contains one dependency tree.", it.hasNext());
assertTrue(sg.containsEdge(sg.getNodeByIndex(4), sg.getNodeByIndex(1)));
assertTrue(sg.containsEdge(sg.getNodeByIndex(2), sg.getNodeByIndex(7)));
assertTrue(sg.containsEdge(sg.getNodeByIndex(4), sg.getNodeByIndex(7)));
}
}
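
For reference, a minimal sketch of driving the reader over a CoNLL-U file (the file name is a hypothetical example; reading from a String works the same way, as in the tests above):

CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
Reader r = new BufferedReader(new FileReader("treebank.conllu"));  // hypothetical file
Iterator<SemanticGraph> it = reader.getIterator(r);
while (it.hasNext()) {
  SemanticGraph sg = it.next();                  // one graph per sentence
  System.out.println(sg.toCompactString(true));  // format asserted in the tests above
}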
