Skip to content

Commit

Permalink
Make SemgrexBatchParser into a static methods class.
Browse files Browse the repository at this point in the history
  • Loading branch information
manning authored and Stanford NLP committed Oct 5, 2015
1 parent 593862d commit fb65b6c
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 58 deletions.
108 changes: 55 additions & 53 deletions src/edu/stanford/nlp/semgraph/semgrex/SemgrexBatchParser.java
Expand Up @@ -10,76 +10,77 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
* Parses a batch of SemgrexPatterns from a stream.
* Each SemgrexPattern must be defined on a single line.
* This includes a preprocessor that supports macros, which are defined as "macro NAME = VALUE" and used as ${NAME}.
* For example:
* # lines starting with the pound sign are skipped
* macro JOB = president|ceo|star
* {}=entity >appos ({lemma:/${JOB}/} >nn {ner:ORGANIZATION}=slot)
*/
public class SemgrexBatchParser {

/** Maximum stream size in characters */
private static final int MAX_STREAM_SIZE = 1024 * 1024;

public List<SemgrexPattern> compileStream(InputStream is) throws IOException {
// Not meant to be instantiated: the only constructor is private and takes no arguments.
private SemgrexBatchParser() { } // static methods class

/**
 * Compiles the patterns found in the given stream without supplying an {@code Env}.
 * Delegates to the two-argument overload, passing {@code null} as the environment.
 *
 * @param is stream containing the pattern definitions
 * @return whatever the two-argument overload returns for this stream
 * @throws IOException if the two-argument overload fails to read the stream
 */
public static List<SemgrexPattern> compileStream(InputStream is) throws IOException {
  final Env noEnv = null;
  return compileStream(is, noEnv);
}

/**
 * Compiles the patterns found in the given stream.
 * <p>
 * The stream is decoded with the platform default charset and read to the end once. The buffered
 * text is then scanned twice: {@code preprocess} builds the macro table, and {@code parse} produces
 * the patterns using that table. Buffering the text replaces the earlier mark/reset approach, whose
 * fixed read-ahead limit could make {@code reset()} fail on long streams.
 *
 * @param is stream containing the pattern definitions; it is consumed but not closed here
 * @param env handed unchanged to {@code parse}; the single-argument overload passes {@code null}
 * @return the list produced by {@code parse}
 * @throws IOException if reading the stream fails
 */
public List<SemgrexPattern> compileStream(InputStream is, Env env) throws IOException {
  InputStreamReader decoder = new InputStreamReader(is);
  StringBuilder text = new StringBuilder();
  char[] chunk = new char[8192];
  for (int count; (count = decoder.read(chunk)) != -1; ) {
    text.append(chunk, 0, count);
  }
  String content = text.toString();
  // Each pass gets its own reader over the same text, so no mark/reset is needed.
  Map<String, String> macros = preprocess(new BufferedReader(new java.io.StringReader(content)));
  return parse(new BufferedReader(new java.io.StringReader(content)), macros, env);
}
private List<SemgrexPattern> parse(BufferedReader reader, Map<String, String> macros, Env env) throws IOException {
List<SemgrexPattern> patterns = new ArrayList<SemgrexPattern>();
for(String line; (line = reader.readLine()) != null; ) {
/**
 * Compiles the patterns found in the given stream.
 * <p>
 * The stream is decoded with the platform default charset and read to the end once. The buffered
 * text is then scanned twice: {@code preprocess} builds the macro table, and {@code parse} produces
 * the patterns using that table. Buffering the text replaces the earlier mark/reset approach, whose
 * fixed read-ahead limit could make {@code reset()} fail on long streams.
 *
 * @param is stream containing the pattern definitions; it is consumed but not closed here
 * @param env handed unchanged to {@code parse}; the single-argument overload passes {@code null}
 * @return the list produced by {@code parse}
 * @throws IOException if reading the stream fails
 */
public static List<SemgrexPattern> compileStream(InputStream is, Env env) throws IOException {
  InputStreamReader decoder = new InputStreamReader(is);
  StringBuilder text = new StringBuilder();
  char[] chunk = new char[8192];
  for (int count; (count = decoder.read(chunk)) != -1; ) {
    text.append(chunk, 0, count);
  }
  String content = text.toString();
  // Each pass gets its own reader over the same text, so no mark/reset is needed.
  Map<String, String> macros = preprocess(new BufferedReader(new java.io.StringReader(content)));
  return parse(new BufferedReader(new java.io.StringReader(content)), macros, env);
}

/**
 * Reads the remaining lines from the reader and compiles one pattern per meaningful line.
 * Each line is trimmed first. Blank lines, lines starting with {@code #} and lines starting with
 * {@code "macro "} are skipped. Every other line has its macro references expanded and is then
 * compiled with {@code SemgrexPattern.compile}.
 *
 * @param reader source of the lines, read until {@code readLine()} returns null
 * @param macros macro table handed to {@code replaceMacros}
 * @param env handed unchanged to {@code SemgrexPattern.compile}
 * @return the compiled patterns, in the order their lines were read
 * @throws IOException if reading a line fails
 */
private static List<SemgrexPattern> parse(BufferedReader reader, Map<String, String> macros, Env env) throws IOException {
  List<SemgrexPattern> compiled = new ArrayList<>();
  String raw = reader.readLine();
  while (raw != null) {
    String text = raw.trim();
    boolean skipped = text.isEmpty() || text.startsWith("#") || text.startsWith("macro ");
    if (!skipped) {
      String expanded = replaceMacros(text, macros);
      compiled.add(SemgrexPattern.compile(expanded, env));
    }
    raw = reader.readLine();
  }
  return compiled;
}
/** Recognizes a macro reference {@code ${NAME}}, where NAME is one or more letters or digits (matched case-insensitively). */
private static final Pattern MACRO_NAME_PATTERN = Pattern.compile("\\$\\{[a-z0-9]+\\}", Pattern.CASE_INSENSITIVE);

/**
 * Substitutes every {@code ${NAME}} reference in a line with the value stored for NAME in {@code macros}.
 * The name is looked up exactly as it is written in the line. If the substitution changed the line,
 * the original and resulting text are reported on standard error.
 *
 * @param line text that may contain macro references
 * @param macros macro names mapped to their replacement text
 * @return the line with all references substituted
 * @throws RuntimeException if a referenced name yields no value from {@code macros}
 */
private String replaceMacros(String line, Map<String, String> macros) {
  StringBuffer result = new StringBuffer();
  Matcher reference = MACRO_NAME_PATTERN.matcher(line);
  while (reference.find()) {
    String token = reference.group();
    String name = token.substring(2, token.length() - 1);  // drop the leading "${" and the trailing "}"
    String value = macros.get(name);
    if (value == null) {
      throw new RuntimeException("ERROR: Unknown macro \"" + name + "\"!");
    }
    // Quote the value so '$' and '\' in it are inserted literally, not read as group references.
    reference.appendReplacement(result, Matcher.quoteReplacement(value));
  }
  reference.appendTail(result);
  String postProcessed = result.toString();
  if (!line.equals(postProcessed)) {
    System.err.println("Line \"" + line + "\" changed to \"" + postProcessed + "\"");
  }
  return postProcessed;
}
private Map<String, String> preprocess(BufferedReader reader) throws IOException {
}
return patterns;
}

/** Recognizes a macro reference {@code ${NAME}}, where NAME is one or more letters or digits (matched case-insensitively). */
private static final Pattern MACRO_NAME_PATTERN = Pattern.compile("\\$\\{[a-z0-9]+\\}", Pattern.CASE_INSENSITIVE);

/**
 * Expands every {@code ${NAME}} reference in a line with the value stored for NAME in {@code macros}.
 * The name is looked up exactly as it is written in the line. If the expansion changed the line,
 * the original and resulting text are reported on standard error.
 *
 * @param line text that may contain macro references
 * @param macros macro names mapped to their replacement text
 * @return the line with all references substituted
 * @throws RuntimeException if a referenced name yields no value from {@code macros}
 */
private static String replaceMacros(String line, Map<String, String> macros) {
  StringBuilder expanded = new StringBuilder();
  Matcher reference = MACRO_NAME_PATTERN.matcher(line);
  int copiedUpTo = 0;
  // find() resumes right after the previous match, which is exactly where copiedUpTo points.
  while (reference.find()) {
    String token = reference.group();
    String name = token.substring(2, token.length() - 1);  // drop the leading "${" and the trailing "}"
    String value = macros.get(name);
    if (value == null) {
      throw new RuntimeException("ERROR: Unknown macro \"" + name + "\"!");
    }
    // Copy the literal text preceding the reference, then the macro's value.
    expanded.append(line, copiedUpTo, reference.start()).append(value);
    copiedUpTo = reference.end();
  }
  expanded.append(line, copiedUpTo, line.length());
  String postProcessed = expanded.toString();
  if (!line.equals(postProcessed)) {
    System.err.println("Line \"" + line + "\" changed to \"" + postProcessed + '"');
  }
  return postProcessed;
}

private static Map<String, String> preprocess(BufferedReader reader) throws IOException {
Map<String, String> macros = Generics.newHashMap();
for(String line; (line = reader.readLine()) != null; ) {
line = line.trim();
Expand All @@ -90,21 +91,22 @@ private Map<String, String> preprocess(BufferedReader reader) throws IOException
}
return macros;
}
private Pair<String, String> extractMacro(String line) {

private static Pair<String, String> extractMacro(String line) {
assert(line.startsWith("macro"));
int equalPosition = line.indexOf('=');
if(equalPosition < 0) {
throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!");
}
String name = line.substring(5, equalPosition).trim();
if(name.length() == 0) {
if(name.isEmpty()) {
throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!");
}
String value = line.substring(equalPosition + 1).trim();
if(value.length() == 0) {
if(value.isEmpty()) {
throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!");
}
return new Pair<String, String>(name, value);
return new Pair<>(name, value);
}

}
Expand Up @@ -16,7 +16,10 @@
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;


/**
* @author Chloe Kiddon
* @author Sonal Gupta
*/
public class SemgrexPatternTest extends TestCase {

/*
Expand Down Expand Up @@ -120,9 +123,8 @@ public void testFind() throws Exception {
public void testMacro() throws IOException {
SemanticGraph h = SemanticGraph.valueOf("[married/VBN nsubjpass>Hughes/NNP auxpass>was/VBD nmod:to>Gracia/NNP]");
String macro = "macro WORD = married";
SemgrexBatchParser parser = new SemgrexBatchParser();
String pattern = "({word:${WORD}}=parent >>nsubjpass {}=node)";
List<SemgrexPattern> pats = parser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8)));
List<SemgrexPattern> pats = SemgrexBatchParser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8)));
SemgrexPattern pat3 = pats.get(0);
boolean ignoreCase = true;
SemgrexMatcher mat3 = pat3.matcher(h, ignoreCase);
Expand All @@ -142,11 +144,10 @@ public void testEnv() throws IOException {
//SemanticGraph t = SemanticGraph
// .valueOf("[loved/VBD\nnsubj:Hughes/NNP\ndobj:[wife/NN poss:his/PRP$ appos:Gracia/NNP]\nconj_and:[obsessed/JJ\ncop:was/VBD\nadvmod:absolutely/RB\nprep_with:[Elicia/NN poss:his/PRP$ amod:little/JJ nn:daughter/NN]]]");
String macro = "macro WORD = married";
SemgrexBatchParser parser = new SemgrexBatchParser();
Env env = new Env();
env.bind("pattern1",PatternsAnnotations.PatternLabel1.class);
String pattern = "({pattern1:YES}=parent >>nsubjpass {}=node)";
List<SemgrexPattern> pats = parser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8)), env);
List<SemgrexPattern> pats = SemgrexBatchParser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8)), env);
SemgrexPattern pat3 = pats.get(0);
boolean ignoreCase = true;
SemgrexMatcher mat3 = pat3.matcher(h, ignoreCase);
Expand Down

0 comments on commit fb65b6c

Please sign in to comment.