
Commit

Merge branch 'master' of jamie:/u/nlp/git/javanlp
Arun Tejasvi Chaganty authored and Stanford NLP committed Jul 26, 2016
1 parent 7b7509a commit 7b26f83
Showing 13 changed files with 229 additions and 137 deletions.
24 changes: 12 additions & 12 deletions itest/src/edu/stanford/nlp/pipeline/PipelineITest.java
@@ -2,7 +2,7 @@


import java.util.List;

-import junit.framework.Assert;
+import org.junit.Assert;
import junit.framework.TestCase;

import edu.stanford.nlp.ling.CoreAnnotations;
@@ -33,25 +33,25 @@ public void testPipeline() throws Exception {
    Annotation document = new Annotation(text);
    Assert.assertEquals(text, document.toString());
    Assert.assertEquals(text, document.get(CoreAnnotations.TextAnnotation.class));

    // annotate text with pipeline
    pipeline.annotate(document);

    // demonstrate typical usage
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {

      // get the tree for the sentence
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);

      // get the tokens for the sentence and iterate over them
      for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {

        // get token attributes
        String tokenText = token.get(CoreAnnotations.TextAnnotation.class);
        String tokenPOS = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String tokenLemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        String tokenNE = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

        // text, pos, lemma and name entity tag should be defined
        Assert.assertNotNull(tokenText);
        Assert.assertNotNull(tokenPOS);
@@ -83,7 +83,7 @@ public void testPipeline() throws Exception {
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    Assert.assertNotNull(sentences);
    Assert.assertEquals(2, sentences.size());

    // sentence 1
    String text1 = "Dan Ramage is working for\nMicrosoft.";
    CoreMap sentence1 = sentences.get(0);
@@ -93,12 +93,12 @@ public void testPipeline() throws Exception {
    Assert.assertEquals(36, (int)sentence1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals(0, (int)sentence1.get(CoreAnnotations.TokenBeginAnnotation.class));
    Assert.assertEquals(7, (int)sentence1.get(CoreAnnotations.TokenEndAnnotation.class));

    // sentence 1 tree
    Tree tree1 = Tree.valueOf("(ROOT (S (NP (NNP Dan) (NNP Ramage)) (VP (VBZ is) " +
        "(VP (VBG working) (PP (IN for) (NP (NNP Microsoft))))) (. .)))");
    Assert.assertEquals(tree1, sentence1.get(TreeCoreAnnotations.TreeAnnotation.class));

    // sentence 1 tokens
    String tokenText1 = "Dan Ramage is working for Microsoft .";
    List<CoreLabel> tokens1 = sentence1.get(CoreAnnotations.TokensAnnotation.class);
@@ -112,7 +112,7 @@ public void testPipeline() throws Exception {
Assert.assertEquals("work", tokens1.get(3).get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertEquals("work", tokens1.get(3).get(CoreAnnotations.LemmaAnnotation.class));
Assert.assertEquals(".", tokens1.get(6).get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertEquals(".", tokens1.get(6).get(CoreAnnotations.LemmaAnnotation.class));
Assert.assertEquals("ORGANIZATION", tokens1.get(5).get(CoreAnnotations.NamedEntityTagAnnotation.class)); Assert.assertEquals("ORGANIZATION", tokens1.get(5).get(CoreAnnotations.NamedEntityTagAnnotation.class));

// sentence 2 // sentence 2
String text2 = "He's in Seattle!"; String text2 = "He's in Seattle!";
CoreMap sentence2 = sentences.get(1); CoreMap sentence2 = sentences.get(1);
@@ -127,7 +127,7 @@ public void testPipeline() throws Exception {
    Tree tree2 = Tree.valueOf("(ROOT (S (NP (PRP He)) (VP (VBZ 's) (PP (IN in) " +
        "(NP (NNP Seattle)))) (. !)))");
    Assert.assertEquals(tree2, sentence2.get(TreeCoreAnnotations.TreeAnnotation.class));

    // sentence 2 tokens
    String tokenText2 = "He 's in Seattle !";
    List<CoreLabel> tokens2 = sentence2.get(CoreAnnotations.TokensAnnotation.class);
@@ -140,7 +140,7 @@ public void testPipeline() throws Exception {
Assert.assertEquals("be", tokens2.get(1).get(CoreAnnotations.LemmaAnnotation.class)); Assert.assertEquals("be", tokens2.get(1).get(CoreAnnotations.LemmaAnnotation.class));
Assert.assertEquals("LOCATION", tokens2.get(3).get(CoreAnnotations.NamedEntityTagAnnotation.class)); Assert.assertEquals("LOCATION", tokens2.get(3).get(CoreAnnotations.NamedEntityTagAnnotation.class));
} }

private static String join(List<CoreLabel> tokens) { private static String join(List<CoreLabel> tokens) {
return StringUtils.join(Iterables.transform(tokens, new Function<CoreLabel, String>() { return StringUtils.join(Iterables.transform(tokens, new Function<CoreLabel, String>() {
public String apply(CoreLabel token) { public String apply(CoreLabel token) {
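The hunks above only show the assertion and iteration code; the pipeline construction happens in a collapsed part of the file. For orientation, here is a minimal sketch (not copied from the test) of how such a pipeline is typically built; the annotator list and the class name PipelineSketch are assumptions, chosen to cover the tokens, POS tags, lemmas, NER tags, and parse trees the test checks.

import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineSketch {
  public static void main(String[] args) {
    // Annotators assumed to cover what the test asserts on:
    // tokens, sentence splits, POS tags, lemmas, NER tags, and constituency trees.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Dan Ramage is working for\nMicrosoft. He's in Seattle!");
    pipeline.annotate(document);
  }
}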
24 changes: 10 additions & 14 deletions src/edu/stanford/nlp/ling/tokensregex/demo/TokensRegexDemo.java
@@ -9,44 +9,41 @@
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.PropertiesUtils;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

/**
- * Demo illustrating how to use CoreMapExtractor
+ * Demo illustrating how to use CoreMapExtractor.
 */
public class TokensRegexDemo {


  public static void main(String[] args) throws IOException {
-    PrintWriter out;

    String rules;
    if (args.length > 0) {
      rules = args[0];
    } else {
      rules = "edu/stanford/nlp/ling/tokensregex/demo/rules/expr.rules.txt";
    }
+    PrintWriter out;
    if (args.length > 2) {
      out = new PrintWriter(args[2]);
    } else {
      out = new PrintWriter(System.out);
    }


-    CoreMapExpressionExtractor extractor = CoreMapExpressionExtractor
-        .createExtractorFromFiles(
-            TokenSequencePattern.getNewEnv(),
-            rules);
+    CoreMapExpressionExtractor<MatchedExpression> extractor = CoreMapExpressionExtractor
+        .createExtractorFromFiles(TokenSequencePattern.getNewEnv(), rules);

-    StanfordCoreNLP pipeline = new StanfordCoreNLP();
+    StanfordCoreNLP pipeline = new StanfordCoreNLP(
+        PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation;
    if (args.length > 1) {
      annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[1]));
    } else {
-      // annotation = new Annotation("I know Fred has acne. And Wilma has breast cancer.");
      annotation = new Annotation("( ( five plus three plus four ) * 2 ) divided by three");
    }


    pipeline.annotate(annotation);
@@ -60,19 +57,18 @@ public static void main(String[] args) throws IOException {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
-      List<MatchedExpression> matchedExpressions = extractor
-          .extractExpressions(sentence);
+      List<MatchedExpression> matchedExpressions = extractor.extractExpressions(sentence);
      for (MatchedExpression matched:matchedExpressions) {
        // Print out matched text and value
-        out.println("matched: " + matched.getText() + " with value " + matched.getValue());
+        out.println("Matched expression: " + matched.getText() + " with value " + matched.getValue());
        // Print out token information
        CoreMap cm = matched.getAnnotation();
        for (CoreLabel token : cm.get(CoreAnnotations.TokensAnnotation.class)) {
          String word = token.get(CoreAnnotations.TextAnnotation.class);
          String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
          String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
-          out.println("matched token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne);
+          out.println("  Matched token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne);
        }
      }
    }
src/edu/stanford/nlp/objectbank/IteratorFromReaderFactory.java
@@ -1,15 +1,20 @@
package edu.stanford.nlp.objectbank;

+import java.io.Serializable;
import java.util.Iterator;


/**
 * An IteratorFromReaderFactory is used to convert a java.io.Reader
 * into an Iterator over the Objects of type T represented by the text
 * in the java.io.Reader.
 *
+ * (We have it be Serializable just to avoid non-serializable warnings;
+ * since implementations of this class normally have no state, they
+ * should be trivially serializable.)
+ *
 * @author Jenny Finkel
 */
-public interface IteratorFromReaderFactory<T> {
+public interface IteratorFromReaderFactory<T> extends Serializable {


  /** Return an iterator over the contents read from r.
   *
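The hunk above cuts off before the interface's method declaration. As a rough illustration only (not part of this commit), an implementation whose iterator yields each line of the Reader as a String could look like the sketch below; the class is hypothetical, and the getIterator signature is an assumption based on the javadoc line shown above.

import java.io.BufferedReader;
import java.io.Reader;
import java.util.Iterator;

import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;

// Hypothetical example, not a class in the repository.
public class LineIteratorFromReaderFactory implements IteratorFromReaderFactory<String> {
  // Return an iterator over the lines read from r.
  public Iterator<String> getIterator(Reader r) {
    return new BufferedReader(r).lines().iterator();
  }
}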
6 changes: 2 additions & 4 deletions src/edu/stanford/nlp/process/TokenizerFactory.java
@@ -5,10 +5,8 @@
import java.io.Reader;


/**
- * A TokenizerFactory is used to convert a java.io.Reader into a Tokenizer
- * (an extension of Iterator) over objects of type T represented by the text
- * in the java.io.Reader. It's mainly a convenience, since you could cast
- * down anyway.
+ * A TokenizerFactory is a factory that can build a Tokenizer (an extension of Iterator)
+ * from a java.io.Reader.
 *
 * <i>IMPORTANT NOTE:</i><br/>
* *
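As context for the reworded javadoc, a minimal usage sketch follows (not from this commit): obtain a concrete factory, build a Tokenizer from a Reader, and iterate over the tokens. PTBTokenizer.coreLabelFactory() and the sample sentence are assumptions here; any TokenizerFactory implementation is used the same way.

import java.io.StringReader;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class TokenizerFactorySketch {
  public static void main(String[] args) {
    // Build a Tokenizer over CoreLabels from a Reader and print each token's text.
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.coreLabelFactory();
    Tokenizer<CoreLabel> tokenizer = factory.getTokenizer(new StringReader("Dan Ramage is working for Microsoft."));
    while (tokenizer.hasNext()) {
      System.out.println(tokenizer.next().word());
    }
  }
}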
@@ -38,7 +38,7 @@ public class CTBErrorCorrectingTreeNormalizer extends BobChrisTreeNormalizer {
  private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*");
  private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*");


-  private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer", "true") != null;
+  private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer") != null;


  @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
  private final TreeTransformer tagExtender;
@@ -203,7 +203,9 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    } else if (kids.length > 0) { // ROOT has 1 child - the normal case
      Tree child = kids[0];
      if ( ! child.isPhrasal()) {
-        EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
+        if (DEBUG) {
+          EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
+        }
        Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids));
        newTree.setChild(0, added);
      } else if (child.label().value().equals("META")) {
@@ -310,6 +312,13 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
      }
    }


+    // at least once we just end up deleting everything under ROOT. In which case, we should just get rid of the tree.
+    if (newTree.numChildren() == 0) {
+      if (DEBUG) {
+        EncodingPrintWriter.err.println("Deleting tree that now has no contents: " + newTree, ChineseTreebankLanguagePack.ENCODING);
+      }
+      return null;
+    }


    if (tagExtender != null) {
      newTree = tagExtender.transformTree(newTree);
@@ -93,6 +93,15 @@ public ChineseHeadFinder(TreebankLanguagePack tlp) {


    // new for ctb6.
    nonTerminalInfo.put("FLR", new String[][]{rightExceptPunct});

+    // new for CTB9
+    nonTerminalInfo.put("DFL", new String[][]{rightExceptPunct});
+    nonTerminalInfo.put("EMO", new String[][]{leftExceptPunct}); // left/right doesn't matter
+    nonTerminalInfo.put("INC", new String[][]{leftExceptPunct});
+    nonTerminalInfo.put("INTJ", new String[][]{leftExceptPunct});
+    nonTerminalInfo.put("OTH", new String[][]{leftExceptPunct});
+    nonTerminalInfo.put("SKIP", new String[][]{leftExceptPunct});

  }


  private static final long serialVersionUID = 6143632784691159283L;
