
Merge branch 'java8' into kbp
gangeli authored and Stanford NLP committed Mar 12, 2015
1 parent a773050 commit 703ce23
Showing 45 changed files with 950 additions and 301 deletions.
26 changes: 25 additions & 1 deletion build.xml
@@ -389,5 +389,29 @@
     </war>
   </target>

-</project>
+  <property environment="env" />
+
+  <condition property="version1.7">
+    <equals arg1="${ant.java.version}" arg2="1.7" />
+  </condition>
+
+  <target name="tregex-osx" if="version1.7" depends="jar"
+          description="Build an OS X app for TregexGUI">
+    <fail unless="env.JAVA_HOME"
+          message="Environment variable JAVA_HOME not set." />
+
+    <taskdef name="bundleapp"
+             classname="com.oracle.appbundler.AppBundlerTask"
+             classpath="lib/appbundler-1.0.jar" />
+
+    <bundleapp outputdirectory="."
+               name="Tregex GUI"
+               displayname="Tregex GUI"
+               identifier="edu.stanford.nlp.trees.tregex.gui.TregexGUI"
+               mainclassname="edu.stanford.nlp.trees.tregex.gui.TregexGUI">
+      <runtime dir="${env.JAVA_HOME}" />
+      <classpath file="javanlp-core.jar" />
+    </bundleapp>
+  </target>
+
+</project>
20 changes: 20 additions & 0 deletions itest/src/edu/stanford/nlp/pipeline/PosParserTagCompatibilityITest.java
@@ -0,0 +1,20 @@
package edu.stanford.nlp.pipeline;

import junit.framework.TestCase;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/**
* @author Christopher Manning
*/
public class PosParserTagCompatibilityITest extends TestCase {

public void testEnglishTagSet() {
LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
MaxentTagger tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
assertEquals("English (PCFG/left3words) tagger/parser tag set mismatch",
lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()), tagger.getTags().tagSet());
}

}
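Note: if this assertion ever fails, printing both tag inventories side by side shows the offending tags directly. A minimal sketch reusing exactly the calls from the test above (it assumes the same model files are on the classpath; the class name TagSetDebug is hypothetical, not part of the commit):

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TagSetDebug {
  public static void main(String[] args) {
    LexicalizedParser lp = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    MaxentTagger tagger = new MaxentTagger(
        "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    // The basic-category tag set known to the parser's lexicon ...
    System.out.println("parser tags: "
        + lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()));
    // ... and the tag set the tagger was trained with; the test asserts these are equal.
    System.out.println("tagger tags: " + tagger.getTags().tagSet());
  }
}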
19 changes: 12 additions & 7 deletions itest/src/edu/stanford/nlp/pipeline/StanfordCoreNLPITest.java
@@ -1,5 +1,7 @@
 package edu.stanford.nlp.pipeline;

+import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
+import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
@@ -173,15 +175,18 @@ public void testRelationExtractor() throws Exception {
   // Check the regexner is integrated with the StanfordCoreNLP
   Properties props = new Properties();
   props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,relation");

-  String text = "Barack Obama is the 44th President of the United States. He is the first African American president.";
+  //props.setProperty("sup.relation.model", "/home/sonalg/javanlp/tmp/roth_relation_model_pipeline.ser");
+  String text = "Barack Obama, a Yale professor, is president.";
   Annotation document = new Annotation(text);
   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
   pipeline.annotate(document);

-  StringWriter stringWriter = new StringWriter();
-  pipeline.prettyPrint(document, new PrintWriter(stringWriter));
-  String result = stringWriter.getBuffer().toString();
+  CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
+  List<RelationMention> rel = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
+  assertEquals(rel.get(0).getType(), "Work_For");
+  // StringWriter stringWriter = new StringWriter();
+  // pipeline.prettyPrint(document, new PrintWriter(stringWriter));
+  // String result = stringWriter.getBuffer().toString();
+  // System.out.println(result);
 }




@@ -230,7 +235,7 @@ private boolean contains(String string, String regexp) {
   Matcher matcher = pattern.matcher(string);
   return matcher.find();
 }

 public void testSerialization()
   throws Exception
 {
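Note: the rewritten testRelationExtractor above is also a compact recipe for pulling relation mentions out of the pipeline. A standalone sketch of the same API usage (the driver class RelationDemo is hypothetical; the annotator list and the example sentence are the ones the test itself configures):

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class RelationDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,relation");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Barack Obama, a Yale professor, is president.");
    pipeline.annotate(document);

    // Relation mentions hang off each sentence, not off the document.
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<RelationMention> rels =
          sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
      if (rels != null) {
        for (RelationMention rm : rels) {
          System.out.println(rm.getType() + ": " + rm);  // e.g. "Work_For"
        }
      }
    }
  }
}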
Binary file added lib/appbundler-1.0.jar
43 changes: 35 additions & 8 deletions src/edu/stanford/nlp/classify/ColumnDataClassifier.java
@@ -36,9 +36,13 @@
 import edu.stanford.nlp.ling.BasicDatum;
 import edu.stanford.nlp.ling.Datum;
 import edu.stanford.nlp.ling.RVFDatum;
+import edu.stanford.nlp.ling.Word;
 import edu.stanford.nlp.objectbank.ObjectBank;
 import edu.stanford.nlp.optimization.DiffFunction;
 import edu.stanford.nlp.optimization.Minimizer;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.process.Tokenizer;
+import edu.stanford.nlp.process.TokenizerFactory;
 import edu.stanford.nlp.process.WordShapeClassifier;
 import edu.stanford.nlp.stats.*;
 import edu.stanford.nlp.util.*;
@@ -149,6 +153,7 @@
 * <tr><td> splitWordsRegexp</td><td>String</td><td>null</td><td>If defined, use this as a regular expression on which to split the whole string (as in the String.split() function, which will return the things between delimiters, and discard the delimiters). The resulting split-up "words" will be used in classifier features iff one of the other "useSplit" options is turned on.</td></tr>
 * <tr><td> splitWordsTokenizerRegexp</td><td>String</td><td>null</td><td>If defined, use this as a regular expression to cut initial pieces off a String. Either this regular expression or <code>splitWordsIgnoreRegexp</code> <i>should always match</i> the start of the String, and the size of the token is the number of characters matched. So, for example, one can group letter and number characters but do nothing else with a regular expression like <code>([A-Za-z]+|[0-9]+|.)</code>, where the last disjunct will match any other single character. (If neither regular expression matches, the first character of the string is treated as a one character word, and then matching is tried again, but in this case a warning message is printed.) Note that, for Java regular expressions with disjunctions like this, the match is the first matching disjunction, not the longest matching disjunction, so patterns with common prefixes need to be ordered from most specific (longest) to least specific (shortest). The resulting split up "words" will be used in classifier features iff one of the other "useSplit" options is turned on. Note that as usual for Java String processing, backslashes must be doubled in the regular expressions that you write.</td></tr>
 * <tr><td> splitWordsIgnoreRegexp</td><td>String</td><td>\\s+</td><td>If non-empty, this regexp is used to determine character sequences which should not be returned as tokens when using <code>splitWordsTokenizerRegexp</code> or <code>splitWordsRegexp</code>. With the former, first the program attempts to match this regular expression at the start of the string (with <code>lookingAt()</code>) and if it matches, those characters are discarded, but if it doesn't match then <code>splitWordsTokenizerRegexp</code> is tried. With <code>splitWordsRegexp</code>, this is used to filter tokens (with <code>matches()</code>) resulting from the splitting. By default this regular expression is set to be all whitespace tokens (i.e., \\s+). Set it to an empty string to get all tokens returned.</td></tr>
+* <tr><td> splitWordsWithPTBTokenizer</td><td>boolean</td><td>false</td><td>If true, and <code>splitWordsRegexp</code> and <code>splitWordsTokenizerRegexp</code> are false, then will tokenize using the <code>PTBTokenizer</code></td></tr>
 * <tr><td> useSplitWords</td><td>boolean</td><td>false</td><td>Make features from the "words" that are returned by dividing the string on splitWordsRegexp or splitWordsTokenizerRegexp. Requires splitWordsRegexp or splitWordsTokenizerRegexp.</td><td>SW-<i>str</i></td></tr>
 * <tr><td> useLowercaseSplitWords</td><td>boolean</td><td>false</td><td>Make features from the "words" that are returned by dividing the string on splitWordsRegexp or splitWordsTokenizerRegexp and then lowercasing the result. Requires splitWordsRegexp or splitWordsTokenizerRegexp. Note that this can be specified independently of useSplitWords. You can put either or both original cased and lowercased words in as features.</td><td>SW-<i>str</i></td></tr>
 * <tr><td> useSplitWordPairs</td><td>boolean</td><td>false</td><td>Make features from the pairs of adjacent "words" that are returned by dividing the string into splitWords. Requires splitWordsRegexp or splitWordsTokenizerRegexp.</td><td>SWP-<i>str1</i>-<i>str2</i></td></tr>
@@ -225,6 +230,7 @@ public class ColumnDataClassifier {
 private final Flags[] flags;
 private final Flags globalFlags; // simply points to flags[0]
 private Classifier<String,String> classifier; // really only assigned once too (either in train or load in setProperties)
+private TokenizerFactory<Word> ptbFactory;

 /**
@@ -236,7 +242,7 @@ public class ColumnDataClassifier {
  * @return A Datum (may be an RVFDatum; never null)
  */
 public Datum<String,String> makeDatumFromLine(String line) {
-  return makeDatumFromStrings(tab.split(line));
+  return makeDatumFromStrings(splitLineToFields(line));
 }

@@ -245,7 +251,7 @@ public Datum<String,String> makeDatumFromLine(String line) {
  * If real-valued features are used, this method accesses makeRVFDatumFromLine
  * and returns an RVFDatum; otherwise, categorical features are used.
  *
- * @param strings The elements that features a made from (the tab-split columns of a TSV file)
+ * @param strings The elements that features are made from (the columns of a TSV/CSV file)
  * @return A Datum (may be an RVFDatum; never null)
  */
 public Datum<String,String> makeDatumFromStrings(String[] strings) {
@@ -846,7 +852,8 @@ private static <F> void addFeature(Object features, F newFeature, double value)
  * @param cWord The String to extract data from
  */
 private void makeDatum(String cWord, Flags flags, Object featuresC, String goldAns) {
   //System.err.println("Making features for " + cWord + " flags " + flags);
   if (flags == null) {
     // no features for this column
     return;
@@ -918,12 +925,15 @@ private void makeDatum(String cWord, Flags flags, Object featuresC, String goldA
     addFeature(featuresC, featureName, DEFAULT_VALUE);
   }
 }
-if (flags.splitWordsPattern != null || flags.splitWordsTokenizerPattern != null) {
+if (flags.splitWordsPattern != null || flags.splitWordsTokenizerPattern != null ||
+    flags.splitWordsWithPTBTokenizer) {
   String[] bits;
   if (flags.splitWordsTokenizerPattern != null) {
     bits = regexpTokenize(flags.splitWordsTokenizerPattern, flags.splitWordsIgnorePattern, cWord);
-  } else {
+  } else if (flags.splitWordsPattern != null) {
     bits = splitTokenize(flags.splitWordsPattern, flags.splitWordsIgnorePattern, cWord);
+  } else { //PTB tokenizer
+    bits = ptbTokenize(cWord);
   }
   if (flags.showTokenization) {
     System.err.print("Tokenization: ");
@@ -1021,6 +1031,18 @@ private void makeDatum(String cWord, Flags flags, Object featuresC, String goldA
//System.err.println("Made featuresC " + featuresC); //System.err.println("Made featuresC " + featuresC);
} //end makeDatum } //end makeDatum


+  // return the tokens using the PTB tokenizer
+  private String[] ptbTokenize(String cWord) {
+    if (ptbFactory == null) {
+      ptbFactory = PTBTokenizer.factory();  // lazily create the shared factory
+    }
+    Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(cWord));
+    List<Word> words = tokenizer.tokenize();
+    String[] res = new String[words.size()];
+    for (int i = 0; i < words.size(); ++i) {
+      res[i] = words.get(i).word();
+    }
+    return res;
+  }


 /**
* Caches a hash of word to all substring features. A <i>lot</i> of memory! * Caches a hash of word to all substring features. A <i>lot</i> of memory!
@@ -1390,8 +1412,9 @@ private Flags[] setProperties(Properties props) {
 for (Enumeration<?> e = props.propertyNames(); e.hasMoreElements();) {
   String key = (String) e.nextElement();
   String val = props.getProperty(key);
+
   int col = 0; // the default (first after class)
-  // System.err.println(key + " = " + val);
+  System.err.println(key + " = " + val);
   Matcher matcher = prefix.matcher(key);
   if (matcher.matches()) {
     col = Integer.parseInt(matcher.group(1));
@@ -1663,8 +1686,10 @@ private Flags[] setProperties(Properties props) {
} else if (key.equals("shuffleSeed")) { } else if (key.equals("shuffleSeed")) {
myFlags[col].shuffleSeed = Long.parseLong(val); myFlags[col].shuffleSeed = Long.parseLong(val);
} else if (key.equals("csvFormat")) { } else if (key.equals("csvFormat")) {
myFlags[col].csvFormat=true; myFlags[col].csvFormat= Boolean.parseBoolean(val);

} else if (key.equals("splitWordsWithPTBTokenizer")) {
System.out.println("splitting with ptb tokenizer");
myFlags[col].splitWordsWithPTBTokenizer=Boolean.parseBoolean(val);
} else if ( ! key.isEmpty() && ! key.equals("prop")) { } else if ( ! key.isEmpty() && ! key.equals("prop")) {
System.err.println("Unknown property: |" + key + '|'); System.err.println("Unknown property: |" + key + '|');
} }
@@ -2014,7 +2039,9 @@ static class Flags implements Serializable {
 int crossValidationFolds = -1;
 boolean shuffleTrainingData = false;
 long shuffleSeed = 0;

 static boolean csvFormat = false; //train and test files are in csv format
+boolean splitWordsWithPTBTokenizer = false;

 @Override
 public String toString() {
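Note: putting the new flag together with the existing ColumnDataClassifier API, a hedged end-to-end sketch. The train.tsv/test.tsv file names and the one-text-column data layout are placeholder assumptions; makeDatumFromLine is the method touched in this diff, and the Properties constructor, readTrainingExamples, and makeClassifier follow the class's documented usage pattern:

import java.util.Properties;

import edu.stanford.nlp.classify.Classifier;
import edu.stanford.nlp.classify.ColumnDataClassifier;
import edu.stanford.nlp.ling.Datum;
import edu.stanford.nlp.objectbank.ObjectBank;

public class CdcPtbDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Tab-separated data assumed: column 0 is the class, column 1 is raw text.
    props.setProperty("useClassFeature", "true");
    props.setProperty("1.useSplitWords", "true");
    // New in this commit: with neither splitWordsRegexp nor
    // splitWordsTokenizerRegexp set, fall back to the PTBTokenizer.
    props.setProperty("1.splitWordsWithPTBTokenizer", "true");

    ColumnDataClassifier cdc = new ColumnDataClassifier(props);
    Classifier<String,String> cl =
        cdc.makeClassifier(cdc.readTrainingExamples("train.tsv"));
    for (String line : ObjectBank.getLineIterator("test.tsv", "utf-8")) {
      Datum<String,String> d = cdc.makeDatumFromLine(line);
      System.out.println(line + "  ==>  " + cl.classOf(d));
    }
  }
}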
3 changes: 3 additions & 0 deletions src/edu/stanford/nlp/ie/NERClassifierCombiner.java
@@ -10,6 +10,7 @@
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.PropertiesUtils;
+import edu.stanford.nlp.util.RuntimeInterruptedException;
 import edu.stanford.nlp.util.StringUtils;


 /**
@@ -114,6 +115,8 @@ public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokens, fin
   // note: requires TextAnnotation, PartOfSpeechTagAnnotation, and AnswerAnnotation
   // note: this sets AnswerAnnotation!
   recognizeNumberSequences(output, document, sentence);
+} catch (RuntimeInterruptedException e) {
+  throw e;
 } catch (Exception e) {
   System.err.println("Ignored an exception in NumberSequenceClassifier: (result is that some numbers were not classified)");
   System.err.println("Tokens: " + StringUtils.joinWords(tokens, " "));
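Note: the change above encodes a general pattern worth naming: when a catch-all exists to make a step best-effort, any exception used to signal cancellation or interruption must be re-thrown before the broad catch can swallow it. Schematically (class and method names here are stand-ins, not the real ones):

import edu.stanford.nlp.util.RuntimeInterruptedException;

public class BestEffortStep {
  void runBestEffort() {
    try {
      riskyWork();  // stand-in for recognizeNumberSequences(...)
    } catch (RuntimeInterruptedException e) {
      throw e;  // interruption must propagate; do not treat it as a soft failure
    } catch (Exception e) {
      // Everything else is logged and ignored, so the pipeline keeps going.
      System.err.println("Ignored an exception: " + e.getMessage());
    }
  }

  void riskyWork() { /* ... */ }
}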
37 changes: 35 additions & 2 deletions src/edu/stanford/nlp/ie/machinereading/GenericDataSetReader.java
@@ -4,9 +4,9 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.Properties;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import java.util.regex.Pattern;


 import edu.stanford.nlp.ie.machinereading.common.NoPunctuationHeadFinder;
 import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
@@ -15,6 +15,9 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.Label;
+import edu.stanford.nlp.stats.ClassicCounter;
+import edu.stanford.nlp.stats.Counter;
+import edu.stanford.nlp.stats.Counters;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
 import edu.stanford.nlp.parser.common.ParserAnnotations;
 import edu.stanford.nlp.parser.common.ParserConstraint;
@@ -135,10 +138,38 @@ public final Annotation parse(String path) throws IOException {


 if (preProcessSentences) {
   preProcessSentences(retVal);
+  if (MachineReadingProperties.trainUsePipelineNER) {
+    logger.severe("Changing NER tags using the CoreNLP pipeline.");
+    modifyUsingCoreNLPNER(retVal);
+  }
 }

 return retVal;
 }

+private void modifyUsingCoreNLPNER(Annotation doc) {
+  Properties ann = new Properties();
+  ann.setProperty("annotators", "pos, lemma, ner");
+  StanfordCoreNLP pipeline = new StanfordCoreNLP(ann, false);
+  pipeline.annotate(doc);
+  for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
+    List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
+    if (entities != null) {
+      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
+      for (EntityMention en : entities) {
+        //System.out.println("old ner tag for " + en.getExtentString() + " was " + en.getType());
+        Span s = en.getExtent();
+        // majority vote over the pipeline's NER tags for the tokens in this span
+        Counter<String> allNertagforSpan = new ClassicCounter<String>();
+        for (int i = s.start(); i < s.end(); i++) {
+          allNertagforSpan.incrementCount(tokens.get(i).ner());
+        }
+        String entityNertag = Counters.argmax(allNertagforSpan);
+        en.setType(entityNertag);
+        //System.out.println("new ner tag is " + entityNertag);
+      }
+    }
+  }
+}


 public Annotation read(String path) throws Exception {
   return null;
@@ -466,6 +497,8 @@ private static CoreLabel initCoreLabel(String token) {
   label.setWord(token);
   label.setValue(token);
   label.set(CoreAnnotations.TextAnnotation.class, token);
+  label.set(CoreAnnotations.ValueAnnotation.class, token);
+
   return label;
 }


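Note: modifyUsingCoreNLPNER above retypes each entity mention by majority vote over the pipeline's per-token NER tags inside the mention's extent. A self-contained illustration of that counting step, with made-up tags for a hypothetical three-token mention:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class MajorityVoteDemo {
  public static void main(String[] args) {
    // Suppose the pipeline tagged the mention's three tokens like this:
    String[] tokenTags = { "PERSON", "PERSON", "O" };

    Counter<String> votes = new ClassicCounter<String>();
    for (String tag : tokenTags) {
      votes.incrementCount(tag);
    }

    // argmax returns the highest-count key: "PERSON" wins 2-1,
    // so the whole mention would be retyped as PERSON.
    System.out.println(Counters.argmax(votes));
  }
}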
src/edu/stanford/nlp/ie/machinereading/MachineReadingProperties.java
@@ -144,6 +144,9 @@ public class MachineReadingProperties {


@Option(name="loadModel",gloss="if true, load a serialized model rather than training a new one") @Option(name="loadModel",gloss="if true, load a serialized model rather than training a new one")
static protected boolean loadModel = false; static protected boolean loadModel = false;

@Option(name="trainUsePipelineNER", gloss="during training, use NER generated by the CoreNLP pipeline")
static public boolean trainUsePipelineNER = false;


 /**
  * evaluation options (ignored if trainOnly is true)
@@ -250,6 +250,7 @@ private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOEx
 for(int i = 0; i < tokens.size(); i ++){
   CoreLabel l = new CoreLabel();
   l.setWord(tokens.get(i).getLiteral());
+  l.set(CoreAnnotations.ValueAnnotation.class, l.word());
   l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
   l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
   words.add(l);
