Skip to content

Commit

Permalink
fix bug with handling of non-conll docs
Browse files Browse the repository at this point in the history
  • Loading branch information
J38 authored and Stanford NLP committed Feb 25, 2018
1 parent f92e149 commit 3b605c0
Show file tree
Hide file tree
Showing 17 changed files with 181 additions and 229 deletions.
Expand Up @@ -93,7 +93,7 @@ private static Map<Integer, List<ExpectedMention>> loadExpectedResults(String fi
} }


if (id == -1) { if (id == -1) {
id = Integer.parseInt(line); id = Integer.valueOf(line);
} else { } else {
mentionLines.add(line.trim()); mentionLines.add(line.trim());
} }
Expand Down
28 changes: 15 additions & 13 deletions src/edu/stanford/nlp/coref/data/Dictionaries.java
@@ -1,4 +1,6 @@
package edu.stanford.nlp.coref.data; package edu.stanford.nlp.coref.data;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;


import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
Expand All @@ -22,20 +24,16 @@
import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.PropertiesUtils; import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;



/** /**
* Stores various data used for coreference. * Stores various data used for coreference.
* TODO: get rid of dependence on HybridCorefProperties * TODO: get rid of dependence on HybridCorefProperties
*
* @author Heeyoung Lee * @author Heeyoung Lee
*/ */
public class Dictionaries { public class Dictionaries {


/** A logger for this class */ /** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(Dictionaries.class); private static Redwood.RedwoodChannels log = Redwood.channels(Dictionaries.class);


public enum MentionType { public enum MentionType {
PRONOMINAL(1), NOMINAL(3), PROPER(4), LIST(2); PRONOMINAL(1), NOMINAL(3), PROPER(4), LIST(2);
Expand Down Expand Up @@ -220,7 +218,9 @@ private void readWordLists(Locale lang) {
public Counter<String> dictScore = new ClassicCounter<>(); public Counter<String> dictScore = new ClassicCounter<>();


private void setPronouns() { private void setPronouns() {
personPronouns.addAll(animatePronouns); for(String s: animatePronouns){
personPronouns.add(s);
}


allPronouns.addAll(firstPersonPronouns); allPronouns.addAll(firstPersonPronouns);
allPronouns.addAll(secondPersonPronouns); allPronouns.addAll(secondPersonPronouns);
Expand All @@ -235,7 +235,7 @@ private void setPronouns() {
* The file is cased and checked cased. * The file is cased and checked cased.
* The result is: statesAbbreviation is a hash from each abbrev to the fullStateName. * The result is: statesAbbreviation is a hash from each abbrev to the fullStateName.
*/ */
private void loadStateAbbreviation(String statesFile) { public void loadStateAbbreviation(String statesFile) {
BufferedReader reader = null; BufferedReader reader = null;
try { try {
reader = IOUtils.readerFromString(statesFile); reader = IOUtils.readerFromString(statesFile);
Expand Down Expand Up @@ -272,7 +272,9 @@ public String lookupCanonicalAmericanStateName(String name) {
* demonymSet has all country (etc.) names and all demonymic Strings. * demonymSet has all country (etc.) names and all demonymic Strings.
*/ */
private void loadDemonymLists(String demonymFile) { private void loadDemonymLists(String demonymFile) {
try (BufferedReader reader = IOUtils.readerFromString(demonymFile)) { BufferedReader reader = null;
try {
reader = IOUtils.readerFromString(demonymFile);
for (String line; (line = reader.readLine()) != null; ) { for (String line; (line = reader.readLine()) != null; ) {
line = line.toLowerCase(Locale.ENGLISH); line = line.toLowerCase(Locale.ENGLISH);
String[] tokens = line.split("\t"); String[] tokens = line.split("\t");
Expand All @@ -288,6 +290,8 @@ private void loadDemonymLists(String demonymFile) {
adjectiveNation.removeAll(demonyms.keySet()); adjectiveNation.removeAll(demonyms.keySet());
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeIOException(e); throw new RuntimeIOException(e);
} finally {
IOUtils.closeIgnoringExceptions(reader);
} }
} }


Expand Down Expand Up @@ -366,11 +370,11 @@ private void loadCountriesLists(String file) {
for (String line; (line = reader.readLine()) != null; ) { for (String line; (line = reader.readLine()) != null; ) {
countries.add(line.split("\t")[1].toLowerCase()); countries.add(line.split("\t")[1].toLowerCase());
} }
reader.close();
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeIOException(e); throw new RuntimeIOException(e);
} }
} }

/** /**
* Load Bergsma and Lin (2006) gender and number list. * Load Bergsma and Lin (2006) gender and number list.
* <br> * <br>
Expand All @@ -391,7 +395,6 @@ private void loadGenderNumber(String file, String neutralWordsFile) {
} }
} }
*/ */

/** /**
* Load Bergsma and Lin (2006) gender and number list. * Load Bergsma and Lin (2006) gender and number list.
* *
Expand Down Expand Up @@ -431,11 +434,10 @@ private void loadGenderNumber(String file, String neutralWordsFile) {
throw new RuntimeIOException(e); throw new RuntimeIOException(e);
} }
} }

public void loadChineseGenderNumberAnimacy(String file) {
private void loadChineseGenderNumberAnimacy(String file) {
String[] split = new String[8]; String[] split = new String[8];
for (String line : IOUtils.readLines(file)) { for (String line : IOUtils.readLines(file)) {
if (line.startsWith("#WORD")) continue; // ignore first row if(line.startsWith("#WORD")) continue; // ignore first row
StringUtils.splitOnChar(split, line, '\t'); StringUtils.splitOnChar(split, line, '\t');


String word = split[0]; String word = split[0];
Expand Down
10 changes: 7 additions & 3 deletions src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java
Expand Up @@ -563,9 +563,13 @@ protected static void processDiscourse(Document doc, Dictionaries dict) {
for(Mention m : doc.predictedMentionsByID.values()) { for(Mention m : doc.predictedMentionsByID.values()) {
String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class); String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
if(debug) log.info("DD: "+speaker); if(debug) log.info("DD: "+speaker);
if (NumberMatchingRegex.isDecimalInteger(speaker)) { // if this is not a CoNLL doc, don't treat a number username as a speakerMentionID
int speakerMentionID = Integer.parseInt(speaker); // conllDoc == null indicates not a CoNLL doc
doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID)); if (doc.conllDoc != null) {
if (NumberMatchingRegex.isDecimalInteger(speaker)) {
int speakerMentionID = Integer.parseInt(speaker);
doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID));
}
} }
} }


Expand Down
6 changes: 3 additions & 3 deletions src/edu/stanford/nlp/dcoref/SieveCoreferenceSystem.java
Expand Up @@ -366,9 +366,9 @@ public static String initializeAndRunCoref(Properties props) throws Exception {
CorefMentionFinder mentionFinder; CorefMentionFinder mentionFinder;
if (mentionFinderPropFilename != null) { if (mentionFinderPropFilename != null) {
Properties mentionFinderProps = new Properties(); Properties mentionFinderProps = new Properties();
try (FileInputStream fis = new FileInputStream(mentionFinderPropFilename)) { FileInputStream fis = new FileInputStream(mentionFinderPropFilename);
mentionFinderProps.load(fis); mentionFinderProps.load(fis);
} fis.close();
mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).getConstructor(Properties.class).newInstance(mentionFinderProps); mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).getConstructor(Properties.class).newInstance(mentionFinderProps);
} else { } else {
mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).newInstance(); mentionFinder = (CorefMentionFinder) Class.forName(mentionFinderClass).newInstance();
Expand Down
35 changes: 15 additions & 20 deletions src/edu/stanford/nlp/ie/ChineseQuantifiableEntityNormalizer.java
@@ -1,6 +1,7 @@
package edu.stanford.nlp.ie; package edu.stanford.nlp.ie;


import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier; import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.SeqClassifierFlags; import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.ClassicCounter;
Expand Down Expand Up @@ -180,9 +181,6 @@ public class ChineseQuantifiableEntityNormalizer {
private static final String RELATIVE_TIME_PATTERN = "([昨今明])[天晨晚夜早]"; private static final String RELATIVE_TIME_PATTERN = "([昨今明])[天晨晚夜早]";
private static final String BIRTH_DECADE_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "[0零〇5五])后"; private static final String BIRTH_DECADE_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "[0零〇5五])后";



private ChineseQuantifiableEntityNormalizer() { } // static methods

/** /**
* Identifies contiguous MONEY, TIME, DATE, or PERCENT entities * Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
* and tags each of their constituents with a "normalizedQuantity" * and tags each of their constituents with a "normalizedQuantity"
Expand Down Expand Up @@ -711,8 +709,8 @@ private static Double normalizeLiteralDecimalString(String s) {


private static String normalizeMonthOrDay(String s, String context) { private static String normalizeMonthOrDay(String s, String context) {
int ctx = -1; int ctx = -1;
if ( ! context.equals("XX")) if (!context.equals("XX"))
ctx = Integer.parseInt(context); ctx = Integer.valueOf(context);


if (monthDayModifiers.containsKey(s)) { if (monthDayModifiers.containsKey(s)) {
if (ctx >= 0) if (ctx >= 0)
Expand All @@ -727,11 +725,10 @@ private static String normalizeMonthOrDay(String s, String context) {
return "XX"; return "XX";
} else { } else {


if (s.matches(CHINESE_DATE_NUMERALS_PATTERN + "+")) { if (s.matches(CHINESE_DATE_NUMERALS_PATTERN + "+"))
candidate = prettyNumber(String.format("%f", recurNormalizeLiteralIntegerString(s))); candidate = prettyNumber(String.format("%f", recurNormalizeLiteralIntegerString(s)));
} else { else
candidate = s; candidate = s;
}
} }


if (candidate.length() < 2) if (candidate.length() < 2)
Expand All @@ -748,7 +745,7 @@ private static String normalizeYear(String s, String contextYear) {
private static String normalizeYear(String s, String contextYear, boolean strict) { private static String normalizeYear(String s, String contextYear, boolean strict) {
int ctx = -1; int ctx = -1;
if (!contextYear.equals("XXXX")) if (!contextYear.equals("XXXX"))
ctx = Integer.parseInt(contextYear); ctx = Integer.valueOf(contextYear);


if (yearModifiers.containsKey(s)) { if (yearModifiers.containsKey(s)) {
if (ctx >= 0) if (ctx >= 0)
Expand All @@ -759,17 +756,15 @@ private static String normalizeYear(String s, String contextYear, boolean strict
String candidate; String candidate;
StringBuilder yearcandidate = new StringBuilder(); StringBuilder yearcandidate = new StringBuilder();
for (int i = 0; i < s.length(); i++) { for (int i = 0; i < s.length(); i++) {
String t = String.valueOf(s.charAt(i)); String t = "" + s.charAt(i);
if (CHINESE_LITERAL_DECIMAL_PATTERN.matcher(t).matches()) { if (CHINESE_LITERAL_DECIMAL_PATTERN.matcher(t).matches()) {
if (wordsToValues.containsKey(t)) { if (wordsToValues.containsKey(t))
yearcandidate.append((int) wordsToValues.getCount(t)); yearcandidate.append((int) wordsToValues.getCount(t));
} else { else
// something unexpected happened // something unexpected happened
return null; return null;
} } else
} else {
yearcandidate.append(t); yearcandidate.append(t);
}
} }


candidate = yearcandidate.toString(); candidate = yearcandidate.toString();
Expand Down Expand Up @@ -925,7 +920,7 @@ public static String normalizeDateString(String s, String ctxdate) {
* @param <E> * @param <E>
* @return * @return
*/ */
private static <E extends CoreMap> String singleEntityToString(List<E> l) { public static <E extends CoreMap> String singleEntityToString(List<E> l) {
String entityType = l.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class); String entityType = l.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class);
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (E w : l) { for (E w : l) {
Expand All @@ -938,11 +933,11 @@ private static <E extends CoreMap> String singleEntityToString(List<E> l) {
return sb.toString(); return sb.toString();
} }


private static String prettyNumber(String s) { public static String prettyNumber(String s) {
if (s == null) { if(s == null) {
return null; return null;
} }
s = ! s.contains(".") ? s : s.replaceAll("0*$", "").replaceAll("\\.$", ""); s = s.indexOf(".") < 0 ? s : s.replaceAll("0*$", "").replaceAll("\\.$", "");
return s; return s;
} }


Expand All @@ -953,7 +948,7 @@ private static String prettyNumber(String s) {
* @param list * @param list
* @param <E> * @param <E>
*/ */
private static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list) { public static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list) {
} }


} }
59 changes: 31 additions & 28 deletions src/edu/stanford/nlp/io/IOUtils.java
Expand Up @@ -273,17 +273,19 @@ public static void writeStringToTempFileNoExceptions(String contents, String pat
* @return The object read from the file. * @return The object read from the file.
*/ */
public static <T> T readObjectFromFile(File file) throws IOException, public static <T> T readObjectFromFile(File file) throws IOException,
ClassNotFoundException { ClassNotFoundException {
try (ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream( try {
new GZIPInputStream(new FileInputStream(file))))) { ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
new GZIPInputStream(new FileInputStream(file))));
Object o = ois.readObject(); Object o = ois.readObject();
ois.close();
return ErasureUtils.uncheckedCast(o); return ErasureUtils.uncheckedCast(o);
} catch (java.util.zip.ZipException e) { } catch (java.util.zip.ZipException e) {
try (ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream( ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
new FileInputStream(file)))) { new FileInputStream(file)));
Object o = ois.readObject(); Object o = ois.readObject();
return ErasureUtils.uncheckedCast(o); ois.close();
} return ErasureUtils.uncheckedCast(o);
} }
} }


Expand All @@ -305,10 +307,10 @@ public static DataOutputStream getDataOutputStream(String filename) throws IOExc
* @return The object read from the file. * @return The object read from the file.
*/ */
public static <T> T readObjectFromURLOrClasspathOrFileSystem(String filename) throws IOException, ClassNotFoundException { public static <T> T readObjectFromURLOrClasspathOrFileSystem(String filename) throws IOException, ClassNotFoundException {
try (ObjectInputStream ois = new ObjectInputStream(getInputStreamFromURLOrClasspathOrFileSystem(filename))) { ObjectInputStream ois = new ObjectInputStream(getInputStreamFromURLOrClasspathOrFileSystem(filename));
Object o = ois.readObject(); Object o = ois.readObject();
return ErasureUtils.uncheckedCast(o); ois.close();
} return ErasureUtils.uncheckedCast(o);
} }


public static <T> T readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem(Redwood.RedwoodChannels log, String msg, String path) { public static <T> T readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem(Redwood.RedwoodChannels log, String msg, String path) {
Expand Down Expand Up @@ -1211,14 +1213,14 @@ public static String slurpURL(URL u, String encoding) throws IOException {
logger.err(throwableToStackTrace(e)); logger.err(throwableToStackTrace(e));
return ""; return "";
} }
try (BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding))) { BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding));
StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish
for (String temp; (temp = br.readLine()) != null; ) { for (String temp; (temp = br.readLine()) != null; ) {
buff.append(temp); buff.append(temp);
buff.append(lineSeparator); buff.append(lineSeparator);
}
return buff.toString();
} }
br.close();
return buff.toString();
} }


public static String getUrlEncoding(URLConnection connection) { public static String getUrlEncoding(URLConnection connection) {
Expand All @@ -1240,18 +1242,18 @@ public static String getUrlEncoding(URLConnection connection) {
* Returns all the text at the given URL. * Returns all the text at the given URL.
*/ */
public static String slurpURL(URL u) throws IOException { public static String slurpURL(URL u) throws IOException {
String lineSeparator = System.getProperty("line.separator");
URLConnection uc = u.openConnection(); URLConnection uc = u.openConnection();
String encoding = getUrlEncoding(uc); String encoding = getUrlEncoding(uc);
InputStream is = uc.getInputStream(); InputStream is = uc.getInputStream();
try (BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding))) { BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding));
StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish StringBuilder buff = new StringBuilder(SLURP_BUFFER_SIZE); // make biggish
String lineSeparator = System.lineSeparator(); for (String temp; (temp = br.readLine()) != null; ) {
for (String temp; (temp = br.readLine()) != null; ) { buff.append(temp);
buff.append(temp); buff.append(lineSeparator);
buff.append(lineSeparator);
}
return buff.toString();
} }
br.close();
return buff.toString();
} }


/** /**
Expand Down Expand Up @@ -1673,7 +1675,8 @@ public static String stringFromFile(String filename, String encoding) {
} }
in.close(); in.close();
return sb.toString(); return sb.toString();
} catch (IOException e) { }
catch (IOException e) {
logger.err(throwableToStackTrace(e)); logger.err(throwableToStackTrace(e));
return null; return null;
} }
Expand Down

0 comments on commit 3b605c0

Please sign in to comment.