Skip to content

Commit

Permalink
Integrated doc date in date normalization; added more relative date patterns; added normalization for birth decades
Browse files Browse the repository at this point in the history
  • Loading branch information
qipeng authored and Stanford NLP committed Sep 17, 2016
1 parent 26754b3 commit 00f80fb
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 20 deletions.
107 changes: 87 additions & 20 deletions src/edu/stanford/nlp/ie/ChineseQuantifiableEntityNormalizer.java
@@ -1,6 +1,7 @@
package edu.stanford.nlp.ie; package edu.stanford.nlp.ie;


import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier; import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.SeqClassifierFlags; import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.ClassicCounter;
Expand Down Expand Up @@ -162,6 +163,9 @@ public class ChineseQuantifiableEntityNormalizer {
+ YEAR_MODIFIER_PATTERN + ")(?:年[份度]?|\\-|/|\\.)?" + "(?:" + BASIC_MMDD_PATTERN + ")?"; + YEAR_MODIFIER_PATTERN + ")(?:年[份度]?|\\-|/|\\.)?" + "(?:" + BASIC_MMDD_PATTERN + ")?";
private static final String ENGLISH_MMDDYYYY_PATTERN = "(\\d{1,2})[/\\-\\.](\\d{1,2})(?:[/\\-\\.](\\d{4}))?"; private static final String ENGLISH_MMDDYYYY_PATTERN = "(\\d{1,2})[/\\-\\.](\\d{1,2})(?:[/\\-\\.](\\d{4}))?";


// Relative day expressions: one of 昨/今/明 (yesterday/today/tomorrow), captured as group 1,
// followed by a single character from 天晨晚夜早. In normalizeDateString the captured
// character is handed to normalizeMonthOrDay together with the context day.
private static final String RELATIVE_TIME_PATTERN = "([昨今明])[天晨晚夜早]";
// Birth-decade expressions: a numeral expression ending in 0/零/〇/5/五 (captured as group 1)
// followed by 后. In normalizeDateString group 1 is normalized as a year in strict mode and
// its last digit is replaced by "X". CHINESE_AND_ARABIC_NUMERALS_PATTERN is declared outside
// this excerpt.
private static final String BIRTH_DECADE_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "[0零〇5五])后";

/** /**
* Identifies contiguous MONEY, TIME, DATE, or PERCENT entities * Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
* and tags each of their constituents with a "normalizedQuantity" * and tags each of their constituents with a "normalizedQuantity"
Expand Down Expand Up @@ -215,10 +219,10 @@ public static <E extends CoreMap> void addNormalizedQuantitiesToEntities(List<E>
// Need different handling for different tags // Need different handling for different tags
switch (prevNerTag) { switch (prevNerTag) {
case TIME_TAG: case TIME_TAG:
// TODO [pengqi]: add TIME // TODO: add TIME
break; break;
case DATE_TAG: case DATE_TAG:
processEntity(collector, prevNerTag, modifier, nextWord); processEntity(collector, prevNerTag, modifier, nextWord, document);
break; break;
default: default:
if(prevNerTag.equals(NUMBER_TAG) || prevNerTag.equals(PERCENT_TAG) || if(prevNerTag.equals(NUMBER_TAG) || prevNerTag.equals(PERCENT_TAG) ||
Expand Down Expand Up @@ -293,6 +297,11 @@ private static <E extends CoreMap> String detectQuantityModifier(List<E> list, i
return null; return null;
} }


/**
 * Convenience overload of {@code processEntity} for callers that have no document reference.
 * Every argument is forwarded unchanged to the five-argument overload, with {@code null}
 * supplied as the document.
 *
 * NOTE(review): the DATE branch of the five-argument overload reads the doc date from the
 * document without a null check, so this overload looks unsafe for DATE entities -- confirm
 * that no caller reaches it with the DATE tag.
 *
 * @param l The entity to process; passed through as-is.
 * @param entityType The NER tag of the entity.
 * @param compModifier The extracted modifier around the entity of interest.
 * @param nextWord Next word in the document.
 * @param <E> Element type of the list.
 * @return Whatever the five-argument overload returns.
 */
private static <E extends CoreMap> List<E> processEntity(List<E> l,
    String entityType, String compModifier, String nextWord) {
  // No document is available on this path, so the delegate receives a null reference.
  final CoreMap noDocument = null;
  return processEntity(l, entityType, compModifier, nextWord, noDocument);
}

/** /**
* Process an entity given the NER tag, extracted modifier and the next word in the document. * Process an entity given the NER tag, extracted modifier and the next word in the document.
* The normalized quantity will be written in place. * The normalized quantity will be written in place.
Expand All @@ -302,11 +311,12 @@ private static <E extends CoreMap> String detectQuantityModifier(List<E> list, i
* @param compModifier The extracted modifier around the entity of interest. Different NER tags should * @param compModifier The extracted modifier around the entity of interest. Different NER tags should
* have different extraction rules. * have different extraction rules.
* @param nextWord Next word in the document. * @param nextWord Next word in the document.
* @param document Reference to the document.
* @param <E> * @param <E>
* @return * @return
*/ */
private static <E extends CoreMap> List<E> processEntity(List<E> l, private static <E extends CoreMap> List<E> processEntity(List<E> l,
String entityType, String compModifier, String nextWord) { String entityType, String compModifier, String nextWord, CoreMap document) {
if(DEBUG) { if(DEBUG) {
log.info("ChineseQuantifiableEntityNormalizer.processEntity: " + l); log.info("ChineseQuantifiableEntityNormalizer.processEntity: " + l);
} }
Expand Down Expand Up @@ -350,8 +360,10 @@ private static <E extends CoreMap> List<E> processEntity(List<E> l,
break; break;
case DATE_TAG: case DATE_TAG:
if (s.matches(BASIC_YYYYMMDD_PATTERN) || s.matches(BASIC_MMDD_PATTERN) if (s.matches(BASIC_YYYYMMDD_PATTERN) || s.matches(BASIC_MMDD_PATTERN)
|| s.matches(ENGLISH_MMDDYYYY_PATTERN) || s.matches(BASIC_DD_PATTERN)) { || s.matches(ENGLISH_MMDDYYYY_PATTERN) || s.matches(BASIC_DD_PATTERN)
p = normalizeDateString(s, new Date()); // FIXME [pengqi]: Should be using real docdate here || s.matches(RELATIVE_TIME_PATTERN) || s.matches(BIRTH_DECADE_PATTERN)) {
String docdate = document.get(CoreAnnotations.DocDateAnnotation.class);
p = normalizeDateString(s, docdate);
} }
break; break;
case TIME_TAG: case TIME_TAG:
Expand Down Expand Up @@ -667,12 +679,16 @@ private static Double normalizeLiteralDecimalString(String s) {
} }


private static String normalizeMonthOrDay(String s, String context) { private static String normalizeMonthOrDay(String s, String context) {
log.info("NORMALIZING MONTH/DAY: " + s); int ctx = -1;
int ctx = Integer.valueOf(context); if (!context.equals("XX"))
ctx = Integer.valueOf(context);


if (monthDayModifiers.containsKey(s)) { if (monthDayModifiers.containsKey(s)) {
// todo: this is unsafe as it's not bound-checked for validity if (ctx >= 0)
return String.format("%02d", ctx + monthDayModifiers.get(s)); // todo: this is unsafe as it's not bound-checked for validity
return String.format("%02d", ctx + monthDayModifiers.get(s));
else
return "XX";
} else { } else {
String candidate; String candidate;


Expand All @@ -694,12 +710,19 @@ private static String normalizeMonthOrDay(String s, String context) {
} }


private static String normalizeYear(String s, String contextYear) { private static String normalizeYear(String s, String contextYear) {
log.info("NORMALIZING YEAR: " + s); return normalizeYear(s, contextYear, false);
int ctx = Integer.valueOf(contextYear); }


private static String normalizeYear(String s, String contextYear, boolean strict) {
int ctx = -1;
if (!contextYear.equals("XXXX"))
ctx = Integer.valueOf(contextYear);


if (yearModifiers.containsKey(s)) { if (yearModifiers.containsKey(s)) {
return String.format("%d", ctx + yearModifiers.get(s)); if (ctx >= 0)
return String.format("%d", ctx + yearModifiers.get(s));
else
return "XXXX";
} else { } else {
String candidate; String candidate;
StringBuilder yearcandidate = new StringBuilder(); StringBuilder yearcandidate = new StringBuilder();
Expand All @@ -720,13 +743,17 @@ private static String normalizeYear(String s, String contextYear) {
return candidate; return candidate;
} }


if (ctx < 0) {
// use the current year as reference point for two digit year normalization by default
ctx = Integer.valueOf(new SimpleDateFormat("yyyy").format(new Date()));
}

// note: this is a very crude heuristic for determining actual year from two digit expressions // note: this is a very crude heuristic for determining actual year from two digit expressions
int cand = Integer.valueOf(candidate); int cand = Integer.valueOf(candidate);


if (cand > (ctx % 100 + 10)) { if ((strict && cand >= (ctx % 100)) || cand > (ctx % 100 + 10)) {
// referring to the previous century // referring to the previous century
cand += (ctx / 100 - 1) * 100; cand += (ctx / 100 - 1) * 100;

} else { } else {
// referring to the same century // referring to the same century
cand += (ctx / 100) * 100; cand += (ctx / 100) * 100;
Expand All @@ -742,16 +769,56 @@ private static String normalizeYear(String s, String contextYear) {
* @param ctxdate Context date (usually doc_date) * @param ctxdate Context date (usually doc_date)
* @return Normalized Timex expression of the input date string * @return Normalized Timex expression of the input date string
*/ */
public static String normalizeDateString(String s, Date ctxdate) { public static String normalizeDateString(String s, String ctxdate) {
// TODO [pengqi]: need to handle basic localization ("在七月二日到[八日]间") // TODO [pengqi]: need to handle basic localization ("在七月二日到[八日]间")
// TODO [pengqi]: need to handle literal numeral dates (usually used in events, e.g. "三一五" for 03-15) // TODO [pengqi]: need to handle literal numeral dates (usually used in events, e.g. "三一五" for 03-15)
// TODO [pengqi]: might need to add a pattern for centuries ("上世纪90年代")? // TODO [pengqi]: might need to add a pattern for centuries ("上世纪90年代")?
String ctxyear = new SimpleDateFormat("yyyy").format(ctxdate);
String ctxmonth = new SimpleDateFormat("MM").format(ctxdate);
String ctxday = new SimpleDateFormat("dd").format(ctxdate);


Pattern p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$"); Pattern p;
Matcher m = p.matcher(s); Matcher m;
String ctxyear = "XXXX", ctxmonth = "XX", ctxday = "XX";

// set up context date
if (ctxdate != null) {
p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
m = p.matcher(ctxdate);

if (m.find() && m.groupCount() == 3) {
ctxyear = m.group(1);
ctxmonth = m.group(2);
ctxday = m.group(3);
}
}

p = Pattern.compile("^" + BIRTH_DECADE_PATTERN + "$");
m = p.matcher(s);

if (m.find() && m.groupCount() == 1) {
StringBuilder res = new StringBuilder();

res.append(normalizeYear(m.group(1), ctxyear, true).substring(0, 3) + "X");
res.append("-XX-XX");

return res.toString();
}

p = Pattern.compile("^" + RELATIVE_TIME_PATTERN + "$");
m = p.matcher(s);

if (m.find() && m.groupCount() == 1) {
StringBuilder res = new StringBuilder();

res.append(ctxyear);
res.append("-");
res.append(ctxmonth);
res.append("-");
res.append(normalizeMonthOrDay(m.group(1), ctxday));

return res.toString();
}

p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
m = p.matcher(s);


if (m.find() && m.groupCount() == 3) { if (m.find() && m.groupCount() == 3) {
StringBuilder res = new StringBuilder(); StringBuilder res = new StringBuilder();
Expand Down
Expand Up @@ -84,6 +84,7 @@ public ChineseNumberSequenceClassifier(Properties props, boolean useSUTime, Prop
public static final Pattern DATE_PATTERN2 = Pattern.compile("(?:星期|周|礼拜).+"); public static final Pattern DATE_PATTERN2 = Pattern.compile("(?:星期|周|礼拜).+");
public static final Pattern DATE_PATTERN3 = Pattern.compile("[0-9一二三四五六七八九零〇十]{2,4}"); public static final Pattern DATE_PATTERN3 = Pattern.compile("[0-9一二三四五六七八九零〇十]{2,4}");
public static final Pattern DATE_PATTERN4 = Pattern.compile("(?:[0-9]{2,4}[/\\-\\.][0-9]+[/\\-\\.][0-9]+|[0-9]+[/\\-\\.][0-9]+[/\\-\\.][0-9]{2,4}|[0-9]+[/\\-\\.]?[0-9]+)"); public static final Pattern DATE_PATTERN4 = Pattern.compile("(?:[0-9]{2,4}[/\\-\\.][0-9]+[/\\-\\.][0-9]+|[0-9]+[/\\-\\.][0-9]+[/\\-\\.][0-9]{2,4}|[0-9]+[/\\-\\.]?[0-9]+)");
// Relative day words: one of 昨/今/明 (yesterday/today/tomorrow) followed by one character
// from 天晨晚夜早. classify() tags a word as DATE when it matches this pattern in full.
public static final Pattern DATE_PATTERN5 = Pattern.compile("[昨今明][天晨晚夜早]");
public static final Pattern TIME_PATTERN1 = Pattern.compile(".+(?::|点|时)(?:过|欠|差)?(?:.+(?::|分)?|整?|钟?|.+刻)?(?:.+秒?)"); // This only works when POS = NT public static final Pattern TIME_PATTERN1 = Pattern.compile(".+(?::|点|时)(?:过|欠|差)?(?:.+(?::|分)?|整?|钟?|.+刻)?(?:.+秒?)"); // This only works when POS = NT


private static final Pattern CHINESE_AND_ARABIC_NUMERALS_PATTERN = Pattern.compile("[一二三四五六七八九零十〇\\d]+"); private static final Pattern CHINESE_AND_ARABIC_NUMERALS_PATTERN = Pattern.compile("[一二三四五六七八九零十〇\\d]+");
Expand Down Expand Up @@ -150,6 +151,7 @@ public List<CoreLabel> classify(List<CoreLabel> document) {
DATE_PATTERN2.matcher(me.word()).matches() || DATE_PATTERN2.matcher(me.word()).matches() ||
DATE_PATTERN3.matcher(me.word()).matches() || DATE_PATTERN3.matcher(me.word()).matches() ||
DATE_PATTERN4.matcher(me.word()).matches() || DATE_PATTERN4.matcher(me.word()).matches() ||
DATE_PATTERN5.matcher(me.word()).matches() ||
DATE_WORDS.contains(me.word())) { DATE_WORDS.contains(me.word())) {
me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG); me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
} else if(TIME_PATTERN1.matcher(me.word()).matches() || } else if(TIME_PATTERN1.matcher(me.word()).matches() ||
Expand Down

0 comments on commit 00f80fb

Please sign in to comment.