fix bug with handling of non-CoNLL docs

stanfordnlp · Feb 25, 2018 · 3b605c0 · 3b605c0
1 parent f92e149
commit 3b605c0
Show file tree

Hide file tree

Showing 17 changed files with 181 additions and 229 deletions.
diff --git a/itest/src/edu/stanford/nlp/dcoref/DcorefExactOutputITest.java b/itest/src/edu/stanford/nlp/dcoref/DcorefExactOutputITest.java
@@ -93,7 +93,7 @@ private static Map<Integer, List<ExpectedMention>> loadExpectedResults(String fi
       }
 
       if (id == -1) {
-        id = Integer.parseInt(line);
+        id = Integer.valueOf(line);
       } else {
         mentionLines.add(line.trim());
       }

diff --git a/src/edu/stanford/nlp/coref/data/Dictionaries.java b/src/edu/stanford/nlp/coref/data/Dictionaries.java
@@ -1,4 +1,6 @@
 package edu.stanford.nlp.coref.data;
+import edu.stanford.nlp.util.StringUtils;
+import edu.stanford.nlp.util.logging.Redwood;
 
 import java.io.BufferedReader;
 import java.io.File;
@@ -22,20 +24,16 @@
 import edu.stanford.nlp.util.Generics;
 import edu.stanford.nlp.util.Pair;
 import edu.stanford.nlp.util.PropertiesUtils;
-import edu.stanford.nlp.util.StringUtils;
-import edu.stanford.nlp.util.logging.Redwood;
-
 
 /**
  * Stores various data used for coreference.
  * TODO: get rid of dependence on HybridCorefProperties
- *
  * @author Heeyoung Lee
  */
 public class Dictionaries  {
 
   /** A logger for this class */
-  private static final Redwood.RedwoodChannels log = Redwood.channels(Dictionaries.class);
+  private static Redwood.RedwoodChannels log = Redwood.channels(Dictionaries.class);
 
   public enum MentionType {
     PRONOMINAL(1), NOMINAL(3), PROPER(4), LIST(2);
@@ -220,7 +218,9 @@ private void readWordLists(Locale lang) {
   public Counter<String> dictScore = new ClassicCounter<>();
 
   private void setPronouns() {
-    personPronouns.addAll(animatePronouns);
+    for(String s: animatePronouns){
+      personPronouns.add(s);
+    }
 
     allPronouns.addAll(firstPersonPronouns);
     allPronouns.addAll(secondPersonPronouns);
@@ -235,7 +235,7 @@ private void setPronouns() {
    *  The file is cased and checked cased.
    *  The result is: statesAbbreviation is a hash from each abbrev to the fullStateName.
    */
-  private void loadStateAbbreviation(String statesFile) {
+  public void loadStateAbbreviation(String statesFile) {
     BufferedReader reader = null;
     try {
       reader = IOUtils.readerFromString(statesFile);
@@ -272,7 +272,9 @@ public String lookupCanonicalAmericanStateName(String name) {
    *  demonymSet has all country (etc.) names and all demonymic Strings.
    */
   private void loadDemonymLists(String demonymFile) {
-    try (BufferedReader reader = IOUtils.readerFromString(demonymFile)) {
+    BufferedReader reader = null;
+    try {
+      reader = IOUtils.readerFromString(demonymFile);
       for (String line; (line = reader.readLine()) != null; ) {
         line = line.toLowerCase(Locale.ENGLISH);
         String[] tokens = line.split("\t");
@@ -288,6 +290,8 @@ private void loadDemonymLists(String demonymFile) {
       adjectiveNation.removeAll(demonyms.keySet());
     } catch (IOException e) {
       throw new RuntimeIOException(e);
+    } finally {
+      IOUtils.closeIgnoringExceptions(reader);
     }
   }
 
@@ -366,11 +370,11 @@ private void loadCountriesLists(String file) {
       for (String line; (line = reader.readLine()) != null; ) {
         countries.add(line.split("\t")[1].toLowerCase());
       }
+      reader.close();
     } catch (IOException e) {
       throw new RuntimeIOException(e);
     }
   }
-
   /**
    * Load Bergsma and Lin (2006) gender and number list.
    * <br>
@@ -391,7 +395,6 @@ private void loadGenderNumber(String file, String neutralWordsFile) {
     }
   }
 */
-
   /**
    * Load Bergsma and Lin (2006) gender and number list.
    *
@@ -431,11 +434,10 @@ private void loadGenderNumber(String file, String neutralWordsFile) {
       throw new RuntimeIOException(e);
     }
   }
-
+  public void loadChineseGenderNumberAnimacy(String file) {
-  private void loadChineseGenderNumberAnimacy(String file) {
     String[] split = new String[8];
     for (String line : IOUtils.readLines(file)) {
-      if (line.startsWith("#WORD")) continue;    // ignore first row
+      if(line.startsWith("#WORD")) continue;    // ignore first row
       StringUtils.splitOnChar(split, line, '\t');
 
       String word = split[0];

diff --git a/src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java b/src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java
@@ -563,9 +563,13 @@ protected static void processDiscourse(Document doc, Dictionaries dict) {
     for(Mention m : doc.predictedMentionsByID.values()) {
       String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
       if(debug) log.info("DD: "+speaker);
-      if (NumberMatchingRegex.isDecimalInteger(speaker)) {
+      // if this is not a CoNLL doc, don't treat a number username as a speakerMentionID
-        int speakerMentionID = Integer.parseInt(speaker);
+      // conllDoc == null indicates not a CoNLL doc
-        doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID));
+      if (doc.conllDoc != null) {
+        if (NumberMatchingRegex.isDecimalInteger(speaker)) {
+          int speakerMentionID = Integer.parseInt(speaker);
+          doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID));
+        }
       }
     }
 

diff --git a/src/edu/stanford/nlp/dcoref/SieveCoreferenceSystem.java b/src/edu/stanford/nlp/dcoref/SieveCoreferenceSystem.java
@@ -366,9 +366,9 @@ public static String initializeAndRunCoref(Properties props) throws Exception {
         CorefMentionFinder mentionFinder;
         if (mentionFinderPropFilename != null) {
           Properties mentionFinderProps = new Properties();
-          try (FileInputStream fis = new FileInputStream(mentionFinderPropFilename)) {
+          FileInputStream fis = new FileInputStream(mentionFinderPropFilename);
-            mentionFinderProps.load(fis);
+          mentionFinderProps.load(fis);
-          }
+          fis.close();
           mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).getConstructor(Properties.class).newInstance(mentionFinderProps);
         } else {
           mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).newInstance();

diff --git a/src/edu/stanford/nlp/ie/ChineseQuantifiableEntityNormalizer.java b/src/edu/stanford/nlp/ie/ChineseQuantifiableEntityNormalizer.java
@@ -1,6 +1,7 @@
 package edu.stanford.nlp.ie;
 
 import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier;
+import edu.stanford.nlp.ling.CoreAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.sequences.SeqClassifierFlags;
 import edu.stanford.nlp.stats.ClassicCounter;
@@ -180,9 +181,6 @@ public class ChineseQuantifiableEntityNormalizer {
   private static final String RELATIVE_TIME_PATTERN = "([昨今明])[天晨晚夜早]";
   private static final String BIRTH_DECADE_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "[0零〇5五])后";
 
-
-  private ChineseQuantifiableEntityNormalizer() { } // static methods
-
   /**
    * Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
    * and tags each of their constituents with a "normalizedQuantity"
@@ -711,8 +709,8 @@ private static Double normalizeLiteralDecimalString(String s) {
 
   private static String normalizeMonthOrDay(String s, String context) {
     int ctx = -1;
-    if ( ! context.equals("XX"))
+    if (!context.equals("XX"))
-      ctx = Integer.parseInt(context);
+      ctx = Integer.valueOf(context);
 
     if (monthDayModifiers.containsKey(s)) {
       if (ctx >= 0)
@@ -727,11 +725,10 @@ private static String normalizeMonthOrDay(String s, String context) {
         return "XX";
       } else {
 
-        if (s.matches(CHINESE_DATE_NUMERALS_PATTERN + "+")) {
+        if (s.matches(CHINESE_DATE_NUMERALS_PATTERN + "+"))
           candidate = prettyNumber(String.format("%f", recurNormalizeLiteralIntegerString(s)));
-        } else {
+        else
           candidate = s;
-        }
       }
 
       if (candidate.length() < 2)
@@ -748,7 +745,7 @@ private static String normalizeYear(String s, String contextYear) {
   private static String normalizeYear(String s, String contextYear, boolean strict) {
     int ctx = -1;
     if (!contextYear.equals("XXXX"))
-      ctx = Integer.parseInt(contextYear);
+      ctx = Integer.valueOf(contextYear);
 
     if (yearModifiers.containsKey(s)) {
       if (ctx >= 0)
@@ -759,17 +756,15 @@ private static String normalizeYear(String s, String contextYear, boolean strict
       String candidate;
       StringBuilder yearcandidate = new StringBuilder();
       for (int i = 0; i < s.length(); i++) {
-        String t = String.valueOf(s.charAt(i));
+        String t = "" + s.charAt(i);
         if (CHINESE_LITERAL_DECIMAL_PATTERN.matcher(t).matches()) {
-          if (wordsToValues.containsKey(t)) {
+          if (wordsToValues.containsKey(t))
             yearcandidate.append((int) wordsToValues.getCount(t));
-          } else {
+          else
             // something unexpected happened
             return null;
-          }
+        } else
-        } else {
           yearcandidate.append(t);
-        }
       }
 
       candidate = yearcandidate.toString();
@@ -925,7 +920,7 @@ public static String normalizeDateString(String s, String ctxdate) {
    * @param <E>
    * @return
    */
-  private static <E extends CoreMap> String singleEntityToString(List<E> l) {
+  public static <E extends CoreMap> String singleEntityToString(List<E> l) {
     String entityType = l.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class);
     StringBuilder sb = new StringBuilder();
     for (E w : l) {
@@ -938,11 +933,11 @@ private static <E extends CoreMap> String singleEntityToString(List<E> l) {
     return sb.toString();
   }
 
-  private static String prettyNumber(String s) {
+  public static String prettyNumber(String s) {
-    if (s == null) {
+    if(s == null) {
       return null;
     }
-    s = ! s.contains(".") ? s : s.replaceAll("0*$", "").replaceAll("\\.$", "");
+    s = s.indexOf(".") < 0 ? s : s.replaceAll("0*$", "").replaceAll("\\.$", "");
     return s;
   }
 
@@ -953,7 +948,7 @@ private static String prettyNumber(String s) {
    * @param list
    * @param <E>
    */
-  private static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list) {
+  public static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list) {
   }
 
 }
diff --git a/src/edu/stanford/nlp/io/IOUtils.java b/src/edu/stanford/nlp/io/IOUtils.java
@@ -273,17 +273,19 @@ public static void writeStringToTempFileNoExceptions(String contents, String pat
    * @return The object read from the file.
    */
   public static <T> T readObjectFromFile(File file) throws IOException,
-          ClassNotFoundException {
+      ClassNotFoundException {
-    try (ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
+    try {
-            new GZIPInputStream(new FileInputStream(file))))) {
+      ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
+          new GZIPInputStream(new FileInputStream(file))));
       Object o = ois.readObject();
+      ois.close();
       return ErasureUtils.uncheckedCast(o);
     } catch (java.util.zip.ZipException e) {
-      try (ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
+      ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
-              new FileInputStream(file)))) {
+          new FileInputStream(file)));
-        Object o = ois.readObject();
+      Object o = ois.readObject();
-        return ErasureUtils.uncheckedCast(o);
+      ois.close();
-      }
+      return ErasureUtils.uncheckedCast(o);
     }
   }
 
@@ -305,10 +307,10 @@ public static DataOutputStream getDataOutputStream(String filename) throws IOExc
    * @return The object read from the file.
    */
   public static <T> T readObjectFromURLOrClasspathOrFileSystem(String filename) throws IOException, ClassNotFoundException {
-    try (ObjectInputStream ois = new ObjectInputStream(getInputStreamFromURLOrClasspathOrFileSystem(filename))) {
+    ObjectInputStream ois = new ObjectInputStream(getInputStreamFromURLOrClasspathOrFileSystem(filename));
-      Object o = ois.readObject();
+    Object o = ois.readObject();
-      return ErasureUtils.uncheckedCast(o);
+    ois.close();
-    }
+    return ErasureUtils.uncheckedCast(o);
   }
 
   public static <T> T readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem(Redwood.RedwoodChannels log, String msg, String path) {
@@ -1211,14 +1213,14 @@ public static String slurpURL(URL u, String encoding) throws IOException {
       logger.err(throwableToStackTrace(e));
       return "";
     }
-    try (BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding))) {
+    BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding));
-      StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish
+    StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish
-      for (String temp; (temp = br.readLine()) != null; ) {
+    for (String temp; (temp = br.readLine()) != null; ) {
-        buff.append(temp);
+      buff.append(temp);
-        buff.append(lineSeparator);
+      buff.append(lineSeparator);
-      }
-      return buff.toString();
     }
+    br.close();
+    return buff.toString();
   }
 
   public static String getUrlEncoding(URLConnection connection) {
@@ -1240,18 +1242,18 @@ public static String getUrlEncoding(URLConnection connection) {
    * Returns all the text at the given URL.
    */
   public static String slurpURL(URL u) throws IOException {
+    String lineSeparator = System.getProperty("line.separator");
     URLConnection uc = u.openConnection();
     String encoding = getUrlEncoding(uc);
     InputStream is = uc.getInputStream();
-    try (BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding))) {
+    BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding));
-      StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish
+    StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish
-      String lineSeparator = System.lineSeparator();
+    for (String temp; (temp = br.readLine()) != null; ) {
-      for (String temp; (temp = br.readLine()) != null; ) {
+      buff.append(temp);
-        buff.append(temp);
+      buff.append(lineSeparator);
-        buff.append(lineSeparator);
-      }
-      return buff.toString();
     }
+    br.close();
+    return buff.toString();
   }
 
   /**
@@ -1673,7 +1675,8 @@ public static String stringFromFile(String filename, String encoding) {
       }
       in.close();
       return sb.toString();
-    } catch (IOException e) {
+    }
+    catch (IOException e) {
       logger.err(throwableToStackTrace(e));
       return null;
     }