Skip to content

Commit

Permalink
Add a two-word MergeNodes operation.
Browse files Browse the repository at this point in the history
MergeNodes operation allows for arbitrary attributes on the new word,
or makes a best effort at putting a combined word & lemma on the new
combined word if nothing was specified.

Would like to extend it to handle multiple words at once
  • Loading branch information
AngledLuffa committed Mar 26, 2023
1 parent db0bd45 commit 0660fa9
Show file tree
Hide file tree
Showing 4 changed files with 323 additions and 5 deletions.
15 changes: 10 additions & 5 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java
Expand Up @@ -139,11 +139,16 @@ public static void moveNode(SemanticGraph sg, SemgrexMatcher sm, IndexedWord wor
}
}

public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination) {
/**
* reverse: operate in reverse order, highest index to first. You want true if moving indices up, false if moving indices down
*/
public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination, boolean reverse) {
// iterate first, then move, so that we don't screw up the graph while iterating
List<IndexedWord> toMove = sg.vertexSet().stream().filter(x -> shouldMove.apply(x.index())).collect(Collectors.toList());
Collections.sort(toMove);
Collections.reverse(toMove);
if (reverse) {
Collections.reverse(toMove);
}
for (IndexedWord word : toMove) {
moveNode(sg, sm, word, destination.apply(word.index()));
}
Expand All @@ -166,8 +171,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
// +2 to leave room: we will increase all other nodes with the
// proper index, so we need +1 of room, then another +1 for
// a temp place to put this node
// TODO: when we implement updating the SemgrexMatcher,
// this won't be necessary
// TODO: we could theoretically put the new node in the right place
// immediately and move the other nodes, but this is easier
tempIndex = SemanticGraphUtils.maxIndex(sg) + 2;

if (position.equals("-")) {
Expand Down Expand Up @@ -203,7 +208,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
if (position != null && !position.equals("+")) {
// the payoff for tempIndex == maxIndex + 2:
// everything will be moved one higher, unless it's the new node
moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1);
moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1, true);
moveNode(sg, sm, newNode, newIndex);
}

Expand Down
159 changes: 159 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
@@ -0,0 +1,159 @@
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;

/**
* Combines two words into one word
*<br>
* This requires one of the nodes to be the head of a phrase of the words,
* and the dependent words can't have any extra edges in or out of that subgraph
*<br>
* The word and lemma will be the combination of the words, squished together.
* Before and after will be updated to use the before and after of the endpoints of the subgraph
*
* @author John Bauer
*/
public class MergeNodes extends SsurgeonEdit {
public static final String LABEL = "mergeNodes";
final String name1;
final String name2;
final Map<String, String> attributes;

public MergeNodes(String name1, String name2, Map<String, String> attributes) {
this.name1 = name1;
this.name2 = name2;
this.attributes = new TreeMap<>(attributes);
}

/**
* Emits a parseable instruction string.
*/
@Override
public String toEditString() {
StringWriter buf = new StringWriter();
buf.write(LABEL); buf.write("\t");
buf.write(name1); buf.write("\t");
buf.write(name2);

// TODO: some attributes might need to be escaped!
for (String key : attributes.keySet()) {
buf.write("\t-");
buf.write(key);
buf.write(" ");
buf.write(attributes.get(key));
}

return buf.toString();
}

/**
* If the two named nodes are next to each other, and the edges of
* the graph allow for it, squish the two words into one word
*/
@Override
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
IndexedWord node1 = sm.getNode(name1);
IndexedWord node2 = sm.getNode(name2);

if (node1 == null || node2 == null) {
return false;
}

List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2);
List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1);
if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
return false;
}

// TODO: what about the case where the dep is or has copies?
final IndexedWord head;
final IndexedWord dep;

if (n1_to_n2.size() > 0) {
head = node1;
dep = node2;
} else {
head = node2;
dep = node1;
}

// If the dep has any edges that aren't between dep & head, abort
// TODO: we could probably make it adjust edges with "dep" as source, instead
for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
if (e.getTarget() != head) {
return false;
}
}
for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
if (e.getSource() != head) {
return false;
}
}

IndexedWord left;
IndexedWord right;
if (node1.index() < node2.index()) {
left = node1;
right = node2;
} else {
left = node2;
right = node1;
}

CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
if (newLabel.word() == null) {
String newWord = left.word() + right.word();
newLabel.setWord(newWord);
}
if (newLabel.value() == null) {
newLabel.setValue(newLabel.word());
}
if (newLabel.lemma() == null) {
String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
newLabel.setLemma(newLemma);
}
// after() and before() return "" if null, so we need to use the CoreAnnotations directly
if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) {
newLabel.setAfter(right.after());
}
if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) {
newLabel.setBefore(right.before());
}

for (IndexedWord vertex : sg.vertexSet()) {
if (vertex.index() == head.index()) {
for (Class key : newLabel.keySet()) {
Object value = newLabel.get(key);
vertex.set(key, value);
}
}
}

// copy the list so that deletion doesn't hurt the iterator
// TODO: super fancy would be implementing iterator.remove()
// on the Set returned by the SemanticGraph
for (IndexedWord vertex : sg.vertexListSorted()) {
if (vertex.index() == dep.index()) {
sg.removeVertex(vertex);
}
}

// reindex everyone
AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);

return true;
}

}

13 changes: 13 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
Expand Up @@ -82,6 +82,7 @@
* <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
* <li> {@code editNode -node node ...attributes...}
* <li> {@code setRoots n1 (n2 n3 ...)}
* <li> {@code mergeNodes n1 n2}
* <li> {@code killAllIncomingEdges -node node}
* <li> {@code deleteGraphFromNode -node node}
* <li> {@code killNonRootedNodes}
Expand Down Expand Up @@ -134,6 +135,11 @@
* This is best done in conjunction with other operations which actually manipulate the structure
* of the graph, or the new root will weirdly have dependents and the graph will be incorrect.
*</p><p>
* {@code mergeNodes} will merge n1 and n2, assuming they are mergeable.
* The nodes can be merged if one of the nodes is the head of a phrase
* and the other node depends on the head. TODO: can make it process
* more than two nodes at once.
*</p><p>
* {@code killAllIncomingEdges} deletes all edges to a node.
* {@code -node} is the node to edit.
* Note that this is the same as {@code removeEdge} with only the dependent set.
Expand Down Expand Up @@ -496,6 +502,13 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
String[] names = tuples1[1].split("\\s+");
List<String> newRoots = Arrays.asList(names);
return new SetRoots(newRoots);
} else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
String[] names = tuples1[1].split("\\s+", 3);
if (names.length == 2 && attributeArgs.size() == 0) {
return new MergeNodes(names[0], names[1], Collections.emptyMap());
}
final SsurgeonArgs argsBox = parseArgsBox(names.length == 2 ? "" : names[2], attributeArgs);
return new MergeNodes(names[0], names[1], argsBox.annotations);
} else if (command.equalsIgnoreCase(KillNonRootedNodes.LABEL)) {
return new KillNonRootedNodes();
}
Expand Down
141 changes: 141 additions & 0 deletions test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
Expand Up @@ -1063,6 +1063,147 @@ public void readXMLAddDepBrokenAnnotation() {
}
}

/**
* Test a basic case of two nodes that should be merged
*<br>
* The indices should be changed as well
*/
@Test
public void readXMLMergeNodes() {
Ssurgeon inst = Ssurgeon.inst();

// Test the head word being the first word
String merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
List<SsurgeonPattern> patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
SsurgeonPattern mergeSsurgeon = patterns.get(0);

SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [prof.-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
IndexedWord prof = sg.getNodeByIndexSafe(3);
assertNotNull(prof);
assertEquals("prof.", prof.word());
assertEquals("prof.", prof.value());
assertNull(prof.lemma());

// Same test, but this time test merging the lemmas
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
sg.getNodeByIndexSafe(3).setLemma("prof");
sg.getNodeByIndexSafe(4).setLemma(".");
newSG = mergeSsurgeon.iterate(sg).first;
assertEquals(expected, newSG);
prof = sg.getNodeByIndexSafe(3);
assertEquals("prof.", prof.lemma());

// Test the head word being the second word
merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
mergeSsurgeon = patterns.get(0);

// Check what happens if the root of the phrase is on the right and the dep is on the left
// The words & lemmas should still hopefully be merged in order
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-4 det> Il-2 punct> .-3 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
sg.getNodeByIndexSafe(3).setLemma(".");
assertEquals(".", sg.getNodeByIndexSafe(3).word());
sg.getNodeByIndexSafe(4).setLemma("prof");
newSG = mergeSsurgeon.iterate(sg).first;
expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [.prof-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
prof = newSG.getNodeByIndexSafe(3);
assertEquals(".prof", prof.word());
assertEquals(".prof", prof.lemma());
}


/**
* Test a basic case of two nodes that should be merged
*<br>
* The indices should be changed as well
*/
@Test
public void readXMLMergeNodesAttributes() {
Ssurgeon inst = Ssurgeon.inst();

// Test the head word being the first word
String merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct -word foo -lemma bar</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
List<SsurgeonPattern> patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
SsurgeonPattern mergeSsurgeon = patterns.get(0);

SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [foo-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
IndexedWord prof = sg.getNodeByIndexSafe(3);
assertNotNull(prof);
assertEquals("foo", prof.word());
assertEquals("foo", prof.value());
assertEquals("bar", prof.lemma());
}

/**
* Test a basic case of two nodes that should be merged
*<br>
* The indices should be changed as well
*/
@Test
public void readXMLMergeNodesFailCases() {
Ssurgeon inst = Ssurgeon.inst();

// use "dep" as the dependency so as to be language-agnostic in this test
String merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
List<SsurgeonPattern> patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
SsurgeonPattern mergeSsurgeon = patterns.get(0);

// Add an extra edge from the punct we want to squash to somewhere else
// The graph should not be changed
SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
SemanticGraph expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
assertEquals(expected, newSG);

sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
newSG = mergeSsurgeon.iterate(sg).first;
expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
}

/**
* The AddDep should update the matches in the SemgrexMatcher.
* If that isn't done correctly, then moving the words first
Expand Down

0 comments on commit 0660fa9

Please sign in to comment.