Integrated doc date in date normalization; added more relative date patterns; added normalization for birth decades
stanfordnlp · Sep 17, 2016 · 00f80fb · 00f80fb
1 parent 26754b3
commit 00f80fb
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 20 deletions.
diff --git a/src/edu/stanford/nlp/ie/ChineseQuantifiableEntityNormalizer.java b/src/edu/stanford/nlp/ie/ChineseQuantifiableEntityNormalizer.java
@@ -1,6 +1,7 @@
 package edu.stanford.nlp.ie;
 
 import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier;
+import edu.stanford.nlp.ling.CoreAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.sequences.SeqClassifierFlags;
 import edu.stanford.nlp.stats.ClassicCounter;
@@ -162,6 +163,9 @@ public class ChineseQuantifiableEntityNormalizer {
           + YEAR_MODIFIER_PATTERN + ")(?:年[份度]?|\\-|/|\\.)?" + "(?:" + BASIC_MMDD_PATTERN + ")?";
   private static final String ENGLISH_MMDDYYYY_PATTERN = "(\\d{1,2})[/\\-\\.](\\d{1,2})(?:[/\\-\\.](\\d{4}))?";
 
+  private static final String RELATIVE_TIME_PATTERN = "([昨今明])[天晨晚夜早]";
+  private static final String BIRTH_DECADE_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "[0零〇5五])后";
+
   /**
    * Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
    * and tags each of their constituents with a "normalizedQuantity"
@@ -215,10 +219,10 @@ public static <E extends CoreMap> void addNormalizedQuantitiesToEntities(List<E>
         // Need different handling for different tags
         switch (prevNerTag) {
           case TIME_TAG:
-            // TODO [pengqi]: add TIME
+            // TODO: add TIME
             break;
           case DATE_TAG:
-            processEntity(collector, prevNerTag, modifier, nextWord);
+            processEntity(collector, prevNerTag, modifier, nextWord, document);
             break;
           default:
             if(prevNerTag.equals(NUMBER_TAG) || prevNerTag.equals(PERCENT_TAG) ||
@@ -293,6 +297,11 @@ private static <E extends CoreMap> String detectQuantityModifier(List<E> list, i
     return null;
   }
 
+  private static <E extends CoreMap> List<E> processEntity(List<E> l, // overload for callers that have no document reference to pass
+           String entityType, String compModifier, String nextWord) {
+    return processEntity(l, entityType, compModifier, nextWord, null); // NOTE(review): document is null here, yet the DATE_TAG branch of the 5-arg overload may call document.get(...) -- confirm this path is never reached with a DATE entity
+  }
+
   /**
    * Process an entity given the NER tag, extracted modifier and the next word in the document.
    * The normalized quantity will be written in place.
@@ -302,11 +311,12 @@ private static <E extends CoreMap> String detectQuantityModifier(List<E> list, i
    * @param compModifier The extracted modifier around the entity of interest. Different NER tags should
    *                    have different extraction rules.
    * @param nextWord Next word in the document.
+   * @param document Reference to the document.
    * @param <E>
    * @return
    */
   private static <E extends CoreMap> List<E> processEntity(List<E> l,
-            String entityType, String compModifier, String nextWord) {
+            String entityType, String compModifier, String nextWord, CoreMap document) {
     if(DEBUG) {
       log.info("ChineseQuantifiableEntityNormalizer.processEntity: " + l);
     }
@@ -350,8 +360,10 @@ private static <E extends CoreMap> List<E> processEntity(List<E> l,
         break;
       case DATE_TAG:
         if (s.matches(BASIC_YYYYMMDD_PATTERN) || s.matches(BASIC_MMDD_PATTERN)
-                || s.matches(ENGLISH_MMDDYYYY_PATTERN) || s.matches(BASIC_DD_PATTERN)) {
+                || s.matches(ENGLISH_MMDDYYYY_PATTERN) || s.matches(BASIC_DD_PATTERN)
-          p = normalizeDateString(s, new Date());   // FIXME [pengqi]: Should be using real docdate here
+                || s.matches(RELATIVE_TIME_PATTERN) || s.matches(BIRTH_DECADE_PATTERN)) {
+          String docdate = document.get(CoreAnnotations.DocDateAnnotation.class);
+          p = normalizeDateString(s, docdate);
         }
         break;
       case TIME_TAG:
@@ -667,12 +679,16 @@ private static Double normalizeLiteralDecimalString(String s) {
   }
 
   private static String normalizeMonthOrDay(String s, String context) {
-    log.info("NORMALIZING MONTH/DAY: " + s);
+    int ctx = -1;
-    int ctx = Integer.valueOf(context);
+    if (!context.equals("XX"))
+      ctx = Integer.valueOf(context);
 
     if (monthDayModifiers.containsKey(s)) {
-      // todo: this is unsafe as it's not bound-checked for validity
+      if (ctx >= 0)
-      return String.format("%02d", ctx + monthDayModifiers.get(s));
+        // todo: this is unsafe as it's not bound-checked for validity
+        return String.format("%02d", ctx + monthDayModifiers.get(s));
+      else
+        return "XX";
     } else {
       String candidate;
 
@@ -694,12 +710,19 @@ private static String normalizeMonthOrDay(String s, String context) {
   }
 
   private static String normalizeYear(String s, String contextYear) {
-    log.info("NORMALIZING YEAR: " + s);
+    return normalizeYear(s, contextYear, false);
-    int ctx = Integer.valueOf(contextYear);
+  }
 
+  private static String normalizeYear(String s, String contextYear, boolean strict) {
+    int ctx = -1;
+    if (!contextYear.equals("XXXX"))
+      ctx = Integer.valueOf(contextYear);
 
     if (yearModifiers.containsKey(s)) {
-      return String.format("%d", ctx + yearModifiers.get(s));
+      if (ctx >= 0)
+        return String.format("%d", ctx + yearModifiers.get(s));
+      else
+        return "XXXX";
     } else {
       String candidate;
       StringBuilder yearcandidate = new StringBuilder();
@@ -720,13 +743,17 @@ private static String normalizeYear(String s, String contextYear) {
         return candidate;
       }
 
+      if (ctx < 0) {
+        // use the current year as reference point for two digit year normalization by default
+        ctx = Integer.valueOf(new SimpleDateFormat("yyyy").format(new Date()));
+      }
+
       // note: this is a very crude heuristic for determining actual year from two digit expressions
       int cand = Integer.valueOf(candidate);
 
-      if (cand > (ctx % 100 + 10)) {
+      if ((strict && cand >= (ctx % 100)) || cand > (ctx % 100 + 10)) {
         // referring to the previous century
         cand += (ctx / 100 - 1) * 100;
-
       } else {
         // referring to the same century
         cand += (ctx / 100) * 100;
@@ -742,16 +769,56 @@ private static String normalizeYear(String s, String contextYear) {
    * @param ctxdate Context date (usually doc_date)
    * @return Normalized Timex expression of the input date string
      */
-  public static String normalizeDateString(String s, Date ctxdate) {
+  public static String normalizeDateString(String s, String ctxdate) {
     // TODO [pengqi]: need to handle basic localization ("在七月二日到[八日]间")
     // TODO [pengqi]: need to handle literal numeral dates (usually used in events, e.g. "三一五" for 03-15)
     // TODO [pengqi]: might need to add a pattern for centuries ("上世纪90年代")?
-    String ctxyear = new SimpleDateFormat("yyyy").format(ctxdate);
-    String ctxmonth = new SimpleDateFormat("MM").format(ctxdate);
-    String ctxday = new SimpleDateFormat("dd").format(ctxdate);
 
-    Pattern p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
+    Pattern p;
-    Matcher m = p.matcher(s);
+    Matcher m;
+    String ctxyear = "XXXX", ctxmonth = "XX", ctxday = "XX";
+
+    // set up context date
+    if (ctxdate != null) {
+      p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
+      m = p.matcher(ctxdate);
+
+      if (m.find() && m.groupCount() == 3) {
+        ctxyear = m.group(1);
+        ctxmonth = m.group(2);
+        ctxday = m.group(3);
+      }
+    }
+
+    p = Pattern.compile("^" + BIRTH_DECADE_PATTERN + "$");
+    m = p.matcher(s);
+
+    if (m.find() && m.groupCount() == 1) {
+      StringBuilder res = new StringBuilder();
+
+      res.append(normalizeYear(m.group(1), ctxyear, true).substring(0, 3) + "X");
+      res.append("-XX-XX");
+
+      return res.toString();
+    }
+
+    p = Pattern.compile("^" + RELATIVE_TIME_PATTERN + "$");
+    m = p.matcher(s);
+
+    if (m.find() && m.groupCount() == 1) {
+      StringBuilder res = new StringBuilder();
+
+      res.append(ctxyear);
+      res.append("-");
+      res.append(ctxmonth);
+      res.append("-");
+      res.append(normalizeMonthOrDay(m.group(1), ctxday));
+
+      return res.toString();
+    }
+
+    p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
+    m = p.matcher(s);
 
     if (m.find() && m.groupCount() == 3) {
       StringBuilder res = new StringBuilder();

diff --git a/src/edu/stanford/nlp/ie/regexp/ChineseNumberSequenceClassifier.java b/src/edu/stanford/nlp/ie/regexp/ChineseNumberSequenceClassifier.java
@@ -84,6 +84,7 @@ public ChineseNumberSequenceClassifier(Properties props, boolean useSUTime, Prop
   public static final Pattern DATE_PATTERN2 = Pattern.compile("(?:星期|周|礼拜).+");
   public static final Pattern DATE_PATTERN3 = Pattern.compile("[0-9一二三四五六七八九零〇十]{2,4}");
   public static final Pattern DATE_PATTERN4 = Pattern.compile("(?:[0-9]{2,4}[/\\-\\.][0-9]+[/\\-\\.][0-9]+|[0-9]+[/\\-\\.][0-9]+[/\\-\\.][0-9]{2,4}|[0-9]+[/\\-\\.]?[0-9]+)");
+  public static final Pattern DATE_PATTERN5 = Pattern.compile("[昨今明][天晨晚夜早]");
   public static final Pattern TIME_PATTERN1 = Pattern.compile(".+(?::|点|时)(?:过|欠|差)?(?:.+(?::|分)?|整?|钟?|.+刻)?(?:.+秒?)"); // This only works when POS = NT
 
   private static final Pattern CHINESE_AND_ARABIC_NUMERALS_PATTERN = Pattern.compile("[一二三四五六七八九零十〇\\d]+");
@@ -150,6 +151,7 @@ public List<CoreLabel> classify(List<CoreLabel> document) {
             DATE_PATTERN2.matcher(me.word()).matches() ||
             DATE_PATTERN3.matcher(me.word()).matches() ||
             DATE_PATTERN4.matcher(me.word()).matches() ||
+            DATE_PATTERN5.matcher(me.word()).matches() ||
             DATE_WORDS.contains(me.word())) {
           me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
         } else if(TIME_PATTERN1.matcher(me.word()).matches() ||