Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ssurgeon - add MergeNodes operation #1351

Merged
merged 2 commits into from
Mar 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,16 @@ public static void moveNode(SemanticGraph sg, SemgrexMatcher sm, IndexedWord wor
}
}

public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination) {
/**
* reverse: operate in reverse order, highest index to first. You want true if moving indices up, false if moving indices down
*/
public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination, boolean reverse) {
// iterate first, then move, so that we don't screw up the graph while iterating
List<IndexedWord> toMove = sg.vertexSet().stream().filter(x -> shouldMove.apply(x.index())).collect(Collectors.toList());
Collections.sort(toMove);
Collections.reverse(toMove);
if (reverse) {
Collections.reverse(toMove);
}
for (IndexedWord word : toMove) {
moveNode(sg, sm, word, destination.apply(word.index()));
}
Expand All @@ -166,8 +171,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
// +2 to leave room: we will increase all other nodes with the
// proper index, so we need +1 of room, then another +1 for
// a temp place to put this node
// TODO: when we implement updating the SemgrexMatcher,
// this won't be necessary
// TODO: we could theoretically put the new node in the right place
// immediately and move the other nodes, but this is easier
tempIndex = SemanticGraphUtils.maxIndex(sg) + 2;

if (position.equals("-")) {
Expand Down Expand Up @@ -203,7 +208,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
if (position != null && !position.equals("+")) {
// the payoff for tempIndex == maxIndex + 2:
// everything will be moved one higher, unless it's the new node
moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1);
moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1, true);
moveNode(sg, sm, newNode, newIndex);
}

Expand Down
157 changes: 157 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;

/**
* Combines two words into one word
*<br>
* This requires one of the nodes to be the head of a phrase of the words,
* and the dependent words can't have any extra edges in or out of that subgraph
*<br>
* The word and lemma will be the combination of the words, squished together.
* Before and after will be updated to use the before and after of the endpoints of the subgraph
*
* @author John Bauer
*/
public class MergeNodes extends SsurgeonEdit {
public static final String LABEL = "mergeNodes";
final String name1;
final String name2;
final Map<String, String> attributes;

public MergeNodes(String name1, String name2, Map<String, String> attributes) {
this.name1 = name1;
this.name2 = name2;
this.attributes = new TreeMap<>(attributes);
}

/**
* Emits a parseable instruction string.
*/
@Override
public String toEditString() {
StringWriter buf = new StringWriter();
buf.write(LABEL); buf.write("\t");
buf.write(name1); buf.write("\t");
buf.write(name2);

// TODO: some attributes might need to be escaped!
for (String key : attributes.keySet()) {
buf.write("\t-");
buf.write(key);
buf.write(" ");
buf.write(attributes.get(key));
}

return buf.toString();
}

/**
* If the two named nodes are next to each other, and the edges of
* the graph allow for it, squish the two words into one word
*/
@Override
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
IndexedWord node1 = sm.getNode(name1);
IndexedWord node2 = sm.getNode(name2);

if (node1 == null || node2 == null) {
return false;
}

List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2);
List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1);
if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
return false;
}

// TODO: what about the case where the dep is or has copies?
final IndexedWord head;
final IndexedWord dep;

if (n1_to_n2.size() > 0) {
head = node1;
dep = node2;
} else {
head = node2;
dep = node1;
}

// If the dep has any edges that aren't between dep & head, abort
// TODO: we could probably make it adjust edges with "dep" as source, instead
for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
if (e.getTarget() != head) {
return false;
}
}
for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
if (e.getSource() != head) {
return false;
}
}

IndexedWord left;
IndexedWord right;
if (node1.index() < node2.index()) {
left = node1;
right = node2;
} else {
left = node2;
right = node1;
}

CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
if (newLabel.word() == null) {
String newWord = left.word() + right.word();
newLabel.setWord(newWord);
}
if (newLabel.value() == null) {
newLabel.setValue(newLabel.word());
}
if (newLabel.lemma() == null) {
String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
newLabel.setLemma(newLemma);
}
if (newLabel.after() == null) {
newLabel.setAfter(right.after());
}
if (newLabel.before() == null) {
newLabel.setBefore(right.before());
}

for (IndexedWord vertex : sg.vertexSet()) {
if (vertex.index() == head.index()) {
for (Class key : newLabel.keySet()) {
Object value = newLabel.get(key);
vertex.set(key, value);
}
}
}

// copy the list so that deletion doesn't hurt the iterator
// TODO: super fancy would be implementing iterator.remove()
// on the Set returned by the SemanticGraph
for (IndexedWord vertex : sg.vertexListSorted()) {
if (vertex.index() == dep.index()) {
sg.removeVertex(vertex);
}
}

// reindex everyone
AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);

return true;
}

}

13 changes: 13 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
* <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
* <li> {@code editNode -node node ...attributes...}
* <li> {@code setRoots n1 (n2 n3 ...)}
* <li> {@code mergeNodes n1 n2}
* <li> {@code killAllIncomingEdges -node node}
* <li> {@code deleteGraphFromNode -node node}
* <li> {@code killNonRootedNodes}
Expand Down Expand Up @@ -134,6 +135,11 @@
* This is best done in conjunction with other operations which actually manipulate the structure
* of the graph, or the new root will weirdly have dependents and the graph will be incorrect.
*</p><p>
* {@code mergeNodes} will merge n1 and n2, assuming they are mergeable.
* The nodes can be merged if one of the nodes is the head of a phrase
* and the other node depends on the head. TODO: can make it process
* more than two nodes at once.
*</p><p>
* {@code killAllIncomingEdges} deletes all edges to a node.
* {@code -node} is the node to edit.
* Note that this is the same as {@code removeEdge} with only the dependent set.
Expand Down Expand Up @@ -496,6 +502,13 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
String[] names = tuples1[1].split("\\s+");
List<String> newRoots = Arrays.asList(names);
return new SetRoots(newRoots);
} else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
String[] names = tuples1[1].split("\\s+", 3);
if (names.length == 2 && attributeArgs.size() == 0) {
return new MergeNodes(names[0], names[1], Collections.emptyMap());
}
final SsurgeonArgs argsBox = parseArgsBox(names.length == 2 ? "" : names[2], attributeArgs);
return new MergeNodes(names[0], names[1], argsBox.annotations);
} else if (command.equalsIgnoreCase(KillNonRootedNodes.LABEL)) {
return new KillNonRootedNodes();
}
Expand Down
142 changes: 142 additions & 0 deletions test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,7 @@ public void readXMLAddUniversalDep() {
assertEquals(Language.UniversalEnglish, edge.getRelation().getLanguage());
}
// they look the same, but they're really not
// the edge is of a different Relation formalism
assertEquals(expected.toString(), newSG.toString());
assertNotEquals(expected, newSG);

Expand Down Expand Up @@ -1062,6 +1063,147 @@ public void readXMLAddDepBrokenAnnotation() {
}
}

/**
* Test a basic case of two nodes that should be merged
*<br>
* The indices should be changed as well
*/
@Test
public void readXMLMergeNodes() {
Ssurgeon inst = Ssurgeon.inst();

// Test the head word being the first word
String merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
List<SsurgeonPattern> patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
SsurgeonPattern mergeSsurgeon = patterns.get(0);

SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [prof.-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
IndexedWord prof = sg.getNodeByIndexSafe(3);
assertNotNull(prof);
assertEquals("prof.", prof.word());
assertEquals("prof.", prof.value());
assertNull(prof.lemma());

// Same test, but this time test merging the lemmas
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
sg.getNodeByIndexSafe(3).setLemma("prof");
sg.getNodeByIndexSafe(4).setLemma(".");
newSG = mergeSsurgeon.iterate(sg).first;
assertEquals(expected, newSG);
prof = sg.getNodeByIndexSafe(3);
assertEquals("prof.", prof.lemma());

// Test the head word being the second word
merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
mergeSsurgeon = patterns.get(0);

// Check what happens if the root of the phrase is on the right and the dep is on the left
// The words & lemmas should still hopefully be merged in order
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-4 det> Il-2 punct> .-3 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
sg.getNodeByIndexSafe(3).setLemma(".");
assertEquals(".", sg.getNodeByIndexSafe(3).word());
sg.getNodeByIndexSafe(4).setLemma("prof");
newSG = mergeSsurgeon.iterate(sg).first;
expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [.prof-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
prof = newSG.getNodeByIndexSafe(3);
assertEquals(".prof", prof.word());
assertEquals(".prof", prof.lemma());
}


/**
* Test a basic case of two nodes that should be merged
*<br>
* The indices should be changed as well
*/
@Test
public void readXMLMergeNodesAttributes() {
Ssurgeon inst = Ssurgeon.inst();

// Test the head word being the first word
String merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct -word foo -lemma bar</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
List<SsurgeonPattern> patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
SsurgeonPattern mergeSsurgeon = patterns.get(0);

SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [foo-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
IndexedWord prof = sg.getNodeByIndexSafe(3);
assertNotNull(prof);
assertEquals("foo", prof.word());
assertEquals("foo", prof.value());
assertEquals("bar", prof.lemma());
}

/**
* Test a basic case of two nodes that should be merged
*<br>
* The indices should be changed as well
*/
@Test
public void readXMLMergeNodesFailCases() {
Ssurgeon inst = Ssurgeon.inst();

// use "dep" as the dependency so as to be language-agnostic in this test
String merge = String.join(newline,
"<ssurgeon-pattern-list>",
" <ssurgeon-pattern>",
" <uid>38</uid>",
" <notes>Merge two nodes that should not have been split</notes>",
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
" <edit-list>mergeNodes source punct</edit-list>",
" </ssurgeon-pattern>",
"</ssurgeon-pattern-list>");
List<SsurgeonPattern> patterns = inst.readFromString(merge);
assertEquals(patterns.size(), 1);
SsurgeonPattern mergeSsurgeon = patterns.get(0);

// Add an extra edge from the punct we want to squash to somewhere else
// The graph should not be changed
SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
SemanticGraph expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
assertEquals(expected, newSG);

sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
newSG = mergeSsurgeon.iterate(sg).first;
expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
assertEquals(expected, newSG);
}

/**
* The AddDep should update the matches in the SemgrexMatcher.
* If that isn't done correctly, then moving the words first
Expand Down