From b118082c1403c8ad7d3c18fc7a211d24fcccb173 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 2 Jan 2023 12:08:07 -0800 Subject: [PATCH] Add a Java command line tool which converts trees to dependency graphs using protobufs. Included is an update to SemanticGraph.valueOf to set a sentIndex and an option in Tree to yield CoreLabels with the word as the Value instead of the tag --- .../stanford/nlp/semgraph/SemanticGraph.java | 30 ++++++-- .../ProcessDependencyConverterRequest.java | 75 +++++++++++++++++++ src/edu/stanford/nlp/trees/Tree.java | 24 +++++- ...ProcessDependencyConverterRequestTest.java | 72 ++++++++++++++++++ 4 files changed, 191 insertions(+), 10 deletions(-) create mode 100644 src/edu/stanford/nlp/trees/ProcessDependencyConverterRequest.java create mode 100644 test/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequestTest.java diff --git a/src/edu/stanford/nlp/semgraph/SemanticGraph.java b/src/edu/stanford/nlp/semgraph/SemanticGraph.java index ddc057f4c6..613e6a6019 100644 --- a/src/edu/stanford/nlp/semgraph/SemanticGraph.java +++ b/src/edu/stanford/nlp/semgraph/SemanticGraph.java @@ -1693,18 +1693,31 @@ public SemanticGraphEdge addEdge(SemanticGraphEdge edge) { * * This is the same format generated by toCompactString(). 
*/ + public static SemanticGraph valueOf(String s, Language language, Integer sentIndex) { + return (new SemanticGraphParsingTask(s, language, sentIndex)).parse(); + } + + /** + * @see SemanticGraph#valueOf(String, Language, Integer) + */ public static SemanticGraph valueOf(String s, Language language) { return (new SemanticGraphParsingTask(s, language)).parse(); } /** - * - * @see SemanticGraph#valueOf(String, Language) + * @see SemanticGraph#valueOf(String, Language, Integer) */ public static SemanticGraph valueOf(String s) { return valueOf(s, Language.UniversalEnglish); } + /** + * @see SemanticGraph#valueOf(String, Language, Integer) + */ + public static SemanticGraph valueOf(String s, int sentIndex) { + return valueOf(s, Language.UniversalEnglish, sentIndex); + } + public SemanticGraph() { graph = new DirectedMultiGraph<>(outerMapFactory, innerMapFactory); @@ -1838,16 +1851,21 @@ private static class SemanticGraphParsingTask extends StringParsingTask indexesUsed = Generics.newHashSet(); - private Language language; - + private final Language language; + private final Integer sentIndex; public SemanticGraphParsingTask(String s) { - this(s, Language.UniversalEnglish); + this(s, Language.UniversalEnglish, null); } public SemanticGraphParsingTask(String s, Language language) { + this(s, language, null); + } + + public SemanticGraphParsingTask(String s, Language language, Integer sentIndex) { super(s); this.language = language; + this.sentIndex = sentIndex; } @Override @@ -1909,7 +1927,7 @@ private IndexedWord makeVertex(String word) { // nothing is actually enforcing that no indexes are used twice. This // could occur if some words in the string representation being parsed // come with index markers and some do not. - IndexedWord ifl = new IndexedWord(null, 0, index); + IndexedWord ifl = new IndexedWord(null, sentIndex != null ? 
sentIndex : 0, index); // log.info("SemanticGraphParsingTask>>> word = " + word); // log.info("SemanticGraphParsingTask>>> index = " + index); // log.info("SemanticGraphParsingTask>>> indexesUsed = " + diff --git a/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequest.java b/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequest.java new file mode 100644 index 0000000000..a6486575df --- /dev/null +++ b/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequest.java @@ -0,0 +1,75 @@ +package edu.stanford.nlp.trees; + +/** + * A tool which converts Tree objects received as protobufs into dependency graphs + * + * Only works for English (at least for now) + */ + +import java.io.InputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.List; +import java.util.stream.Collectors; + +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.pipeline.CoreNLPProtos; +import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphFactory; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.ProcessProtobufRequest; + +public class ProcessDependencyConverterRequest extends ProcessProtobufRequest { + /** + * Convert a single Tree to basic dependencies + */ + static SemanticGraph convert(Tree tree) { + SemanticGraph uncollapsedDeps = SemanticGraphFactory.generateUncollapsedDependencies(tree); + return uncollapsedDeps; + } + + /** + * Process a single request, responding with basic dependencies for each tree. Every node of a graph gets its tree's position in the request as its sentence index, and each tree is echoed back alongside its graph + */ + static CoreNLPProtos.DependencyConverterResponse processRequest(CoreNLPProtos.DependencyConverterRequest request) { + ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(); + CoreNLPProtos.DependencyConverterResponse.Builder responseBuilder = CoreNLPProtos.DependencyConverterResponse.newBuilder(); + + List flattenedTrees = request.getTreesList(); + int treeIdx = 0; + for 
(CoreNLPProtos.FlattenedParseTree flattenedTree : flattenedTrees) { + Tree tree = ProtobufAnnotationSerializer.fromProto(flattenedTree); + SemanticGraph graph = convert(tree); + for (IndexedWord node : graph.vertexSet()) { + node.set(CoreAnnotations.SentenceIndexAnnotation.class, treeIdx); + } + CoreNLPProtos.DependencyConverterResponse.DependencyConversion.Builder conversionBuilder = CoreNLPProtos.DependencyConverterResponse.DependencyConversion.newBuilder(); + conversionBuilder.setGraph(ProtobufAnnotationSerializer.toProto(graph)); + conversionBuilder.setTree(flattenedTree); + responseBuilder.addConversions(conversionBuilder.build()); + ++treeIdx; + } + return responseBuilder.build(); + } + + /** + * Process a single request from a stream, responding with basic dependencies for each tree + */ + @Override + public void processInputStream(InputStream in, OutputStream out) throws IOException { + CoreNLPProtos.DependencyConverterRequest request = CoreNLPProtos.DependencyConverterRequest.parseFrom(in); + CoreNLPProtos.DependencyConverterResponse response = processRequest(request); + response.writeTo(out); + } + + /** + * The inherited main program will either process a single request, + * or will listen to stdin and process every request that comes in + * until a terminator is sent or the stream closes + */ + public static void main(String[] args) throws IOException { + ProcessProtobufRequest.process(new ProcessDependencyConverterRequest(), args); + } +} diff --git a/src/edu/stanford/nlp/trees/Tree.java b/src/edu/stanford/nlp/trees/Tree.java index e79363438d..440e66a17f 100644 --- a/src/edu/stanford/nlp/trees/Tree.java +++ b/src/edu/stanford/nlp/trees/Tree.java @@ -1625,12 +1625,24 @@ public List labeledYield(List ty) { * @return A tagged, labeled yield. */ public List taggedLabeledYield() { + return taggedLabeledYield(true); + } + + + /** Returns a {@code List} of {@code CoreLabel} from the tree. + * These are a copy of the complete token representation + * along with the tag. 
+ * + * @param tagValues if true, each label's value is set to its tag; if false, to the label's word + * @return A tagged, labeled yield. + */ + public List taggedLabeledYield(boolean tagValues) { List ty = new ArrayList<>(); - taggedLabeledYield(ty, 0); + taggedLabeledYield(ty, 0, tagValues); return ty; } - private int taggedLabeledYield(List ty, int termIdx) { + private int taggedLabeledYield(List ty, int termIdx, boolean tagValues) { if (isPreTerminal()) { // usually this will fill in all the usual keys for a token CoreLabel taggedWord = new CoreLabel(firstChild().label()); @@ -1640,7 +1652,11 @@ private int taggedLabeledYield(List ty, int termIdx) { } final String tag = (value() == null) ? "" : value(); // set value and tag to the tag - taggedWord.setValue(tag); + if (tagValues) { + taggedWord.setValue(tag); + } else { + taggedWord.setValue(taggedWord.word()); + } taggedWord.setTag(tag); taggedWord.setIndex(termIdx); ty.add(taggedWord); @@ -1649,7 +1665,7 @@ private int taggedLabeledYield(List ty, int termIdx) { } else { for (Tree kid : getChildrenAsList()) - termIdx = kid.taggedLabeledYield(ty, termIdx); + termIdx = kid.taggedLabeledYield(ty, termIdx, tagValues); } return termIdx; diff --git a/test/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequestTest.java b/test/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequestTest.java new file mode 100644 index 0000000000..f341b52423 --- /dev/null +++ b/test/src/edu/stanford/nlp/trees/ProcessDependencyConverterRequestTest.java @@ -0,0 +1,72 @@ +package edu.stanford.nlp.trees; + +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.Assert; +import org.junit.Test; + +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.pipeline.CoreNLPProtos; +import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.trees.Tree; + +public class ProcessDependencyConverterRequestTest { + 
+ static CoreNLPProtos.DependencyConverterRequest buildRequest(String ... trees) { + CoreNLPProtos.DependencyConverterRequest.Builder builder = CoreNLPProtos.DependencyConverterRequest.newBuilder(); + + for (String tree : trees) { + Tree t = Tree.valueOf(tree); + builder.addTrees(ProtobufAnnotationSerializer.toFlattenedTree(t)); + } + + return builder.build(); + } + + static void checkResults(CoreNLPProtos.DependencyConverterResponse response, String ... expectedResults) { + Assert.assertEquals(expectedResults.length, response.getConversionsList().size()); + for (int i = 0; i < expectedResults.length; ++i) { + CoreNLPProtos.DependencyGraph responseGraph = response.getConversionsList().get(i).getGraph(); + CoreNLPProtos.FlattenedParseTree responseTree = response.getConversionsList().get(i).getTree(); + Tree tree = ProtobufAnnotationSerializer.fromProto(responseTree); + List sentence = tree.taggedLabeledYield(false); + + SemanticGraph expected = SemanticGraph.valueOf(expectedResults[i], i); + SemanticGraph graph = ProtobufAnnotationSerializer.fromProto(responseGraph, sentence, null); + //for (IndexedWord word : expected.vertexSet()) { + // System.out.println(word + " " + word.index() + " " + word.sentIndex() + " " + word.docID()); + //} + //for (IndexedWord word : graph.vertexSet()) { + // System.out.println(word + " " + word.index() + " " + word.sentIndex() + " " + word.docID()); + //} + //System.out.println(expected.toCompactString()); + //System.out.println(graph.toCompactString()); + Assert.assertEquals(expected, graph); + } + } + + /** Test a single Tree turning into Dependencies */ + @Test + public void testOneTree() { + CoreNLPProtos.DependencyConverterRequest request = buildRequest("(ROOT (S (NP (NNP Jennifer)) (VP (VBZ has) (NP (JJ nice) (NNS antennae)))))"); + CoreNLPProtos.DependencyConverterResponse response = ProcessDependencyConverterRequest.processRequest(request); + checkResults(response, "[has/VBZ-1 nsubj>Jennifer/NNP-0 obj>[antennae/NNS-3 
amod>nice/JJ-2]]"); + } + + /** Test two trees turning into Dependencies */ + @Test + public void testTwoTrees() { + CoreNLPProtos.DependencyConverterRequest request = buildRequest("(ROOT (S (NP (NNP Jennifer)) (VP (VBZ has) (NP (JJ nice) (NNS antennae)))))", + "(ROOT (S (NP (PRP She)) (VP (VBZ is) (ADJP (RB hella) (JJ basic)) (ADVP (RB though)))))"); + CoreNLPProtos.DependencyConverterResponse response = ProcessDependencyConverterRequest.processRequest(request); + checkResults(response, + "[has/VBZ-1 nsubj>Jennifer/NNP-0 obj>[antennae/NNS-3 amod>nice/JJ-2]]", + "[basic/JJ-3 nsubj>She/PRP-0 cop>is/VBZ-1 advmod>hella/RB-2 advmod>though/RB-4]"); + } + +} + +