Merge pull request #33 from leungmanhin/ghost-dot

Fix issues with dots and numbers in Ghost
singnet · Oct 25, 2018 · 5bd74e1 · 5bd74e1
2 parents 989a24f + 66ac466
commit 5bd74e1
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 34 deletions.
diff --git a/opencog/ghost/cs-parse.scm b/opencog/ghost/cs-parse.scm
@@ -143,10 +143,20 @@
     ((has-match? "!" str) (result:suffix 'NOT location #f))
     ((has-match? "[?]" str) (result:suffix '? location "?"))
     ((has-match? "=" str) (result:suffix 'EQUAL location #f))
+    ; For time -- a.m. and p.m.
+    ; Just catch it to avoid it being splitted into multiple words
+    ; Should be done before any literal / lemma matching
+    ((has-match? "[ap]\\.m\\." str)
+      (result:suffix 'STRING location
+        (string-trim-both (match:substring current-match))))
     ; Words with apostrophe, e.g. I'm, it's etc
     ((has-match? "[a-zA-Z]+['’][a-zA-Z]+" str)
       (result:suffix 'LITERAL_APOS location
         (string-trim-both (match:substring current-match))))
+    ; Literals for example: Mr. Dr. etc
+    ((has-match? "[a-zA-Z]+\\." str)
+      (result:suffix 'LITERAL location
+        (string-trim-both (match:substring current-match))))
     ; Literals -- words start with a '
     ((has-match? "'[a-zA-Z]+\\b" str)
       (result:suffix 'LITERAL location
@@ -158,15 +168,15 @@
         ; Literals, words in the pattern that are not in their canonical forms
         (result:suffix 'LITERAL location
           (string-trim-both (match:substring current-match)))))
-    ((has-match? "[0-9]+[0-9.]*" str)
+    ((has-match? "[0-9]+[0-9.]*\\b" str)
       (result:suffix 'NUM location
         (string-trim-both (match:substring current-match))))
     ((has-match? "[|]" str)
       (result:suffix 'VLINE location
-         (string-trim-both (match:substring current-match))))
+        (string-trim-both (match:substring current-match))))
     ((has-match? "," str)
       (result:suffix 'COMMA location
-         (string-trim-both (match:substring current-match))))
+        (string-trim-both (match:substring current-match))))
     ; This should always be near the end, because it is broadest of all.
     ((has-match? "[~’'._!?0-9a-zA-Z-]+" str)
       (result:suffix 'STRING location
@@ -507,6 +517,7 @@
 
     (literal
       (LITERAL) : (format #f "(cons 'word \"~a\")" $1)
+      (NUM) : (format #f "(cons 'word \"~a\")" $1)
       (STRING) : (format #f "(cons 'word \"~a\")" $1)
     )
 

diff --git a/opencog/ghost/terms.scm b/opencog/ghost/terms.scm
@@ -6,14 +6,29 @@
 "
   Occurrence of a word, a word that should be matched literally.
 "
-  (let* ((str-dc (if (string=? STR "I") STR (string-downcase STR)))
-         (v1 (WordNode str-dc))
-         (v2 (Variable (gen-var str-dc #f)))
-         (l (WordNode (get-lemma str-dc)))
-         (v (list (TypedVariable v2 (Type "WordInstanceNode"))))
-         (c (list (WordInstanceLink v2 (Variable "$P"))
-                  (ReferenceLink v2 v1))))
-    (list v c (list v1) (list l))))
+  (let* ((str-dc (string-downcase STR))
+         ; Special handling for time that's written as a single word
+         ; e.g. 2pm, 10am etc
+         ; For the input, regardless of the format, e.g. "2am", "2 am",
+         ; "2 a.m." or "2a.m.", will all get splitted into two words
+         (is-time? (string-match "[0-9]{1,2}[ ]*[ap][.]*m[.]*" str-dc))
+         (is-two-digits?
+           (and is-time? (char-numeric? (string-ref str-dc 1))))
+         (time-1pt
+           (if is-two-digits?
+             (substring str-dc 0 2)
+             (substring str-dc 0 1)))
+         (time-2pt
+           (if is-two-digits?
+             (substring str-dc 2)
+             (substring str-dc 1))))
+    (list (list) (list)
+      (if is-time?
+        (list (WordNode time-1pt) (WordNode time-2pt))
+        (list (WordNode str-dc)))
+      (if is-time?
+        (list (WordNode time-1pt) (WordNode time-2pt))
+        (list (WordNode (get-lemma str-dc)))))))
 
 ; ----------
 (define (word-apos STR)
@@ -23,33 +38,28 @@
 "
   (let* (; This turns ’ into ' just to treat them as the same thing
          (nstr (regexp-substitute/global #f "’" STR 'pre "'" 'post))
-         (l (WordNode
-              (if (string-prefix? "I'" nstr) nstr (string-downcase nstr)))))
-    (list (list) (list) (list l) (list l))))
+         (w (WordNode (string-downcase nstr))))
+    (list (list) (list) (list w) (list w))))
 
 ; ----------
 (define (lemma STR)
 "
   Lemma occurrence, aka canonical form of a term.
   This is the default for word mentions in the rule pattern.
 "
-  (let* ((str-dc (if (string=? STR "I") STR (string-downcase STR)))
-         (v1 (Variable (gen-var str-dc #t)))
-         (v2 (Variable (gen-var str-dc #f)))
+  (let* ((str-dc (string-downcase STR))
+         (var (Variable (gen-var str-dc #t)))
          (l (WordNode (get-lemma str-dc)))
-         (v (list (TypedVariable v1 (Type "WordNode"))
-                  (TypedVariable v2 (Type "WordInstanceNode"))))
-         (c (list (ReferenceLink v2 v1)
-                  ; In some rare situation, particularly if the input
+         (v (list (TypedVariable var (Type "WordNode"))))
+         (c (list ; In some rare situation, particularly if the input
                   ; sentence is not grammatical, RelEx may not lemmatize a
                   ; word because of the ambiguity
-                  ; So just to be sure "l" is the stem of "v1",
+                  ; So just to be sure "l" is the stem of "var",
                   ; a GroundedPredicateNode is used instead of putting
                   ; "(LemmaLink v2 l)" in the context
                   (Evaluation (GroundedPredicate "scm: ghost-lemma?")
-                              (List v1 l))
-                  (WordInstanceLink v2 (Variable "$P")))))
-    (list v c (list v1) (list l))))
+                              (List var l)))))
+    (list v c (list var) (list l))))
 
 (define-public (ghost-lemma? GRD LEMMA)
 "
@@ -557,7 +567,7 @@
     ((string=? "equal" OPERATOR)
      (if both-numbers?
        (= lv-num rv-num)
-       (string=? lv-str rv-str)))
+       (string-ci=? lv-str rv-str)))
     ((string=? "smaller" OPERATOR)
      (and both-numbers?
           (< lv-num rv-num)))

diff --git a/opencog/ghost/utils.scm b/opencog/ghost/utils.scm
@@ -122,15 +122,26 @@
            (set! final-word-seq (append final-word-seq (list merged-word)))
            (set! word-apos-alist (assoc-set! word-apos-alist (cons i (1+ i)) merged-word))
            (set! i (1+ i))))
+        ; To merge for example "dr" and "." into one word, just for matching
+        ((and (string=? next-word-str ".")
+              (is-nonbreaking-prefix? current-word-str))
+         (set! final-word-seq (append final-word-seq
+           (list (WordNode (string-append current-word-str ".")))))
+         (set! i (1+ i)))
         ; The current word may also have an apostrophe, make sure to turn
         ; "’" into "'" as well for consistency
         ((> (length current-word-splitted) 1)
          (let ((new-word (WordNode (string-join current-word-splitted "'"))))
            (set! final-word-seq (append final-word-seq (list new-word)))
            (set! word-apos-alist (assoc-set! word-apos-alist (cons i i) new-word))))
         (else (set! final-word-seq (append final-word-seq (list current-word-node)))))))
-
-  (Evaluation ghost-word-seq (List SENT (List final-word-seq)))
+  (Evaluation
+    ghost-word-seq
+    (List SENT
+      (List (map
+        (lambda (w)
+          (WordNode (string-downcase (cog-name w))))
+        final-word-seq))))
 
   ; Generate this lemma-seq only for backward compatibility
   (if (not ghost-with-ecan)
@@ -161,15 +172,34 @@
           (cond
             ; For the next-word-prefix-with-apos? case
             ((assoc-ref word-apos-alist (cons i (1+ i)))
-             (begin
-               (set! i (1+ i))
-               (assoc-ref word-apos-alist (cons (1- i) i))))
+             (set! i (1+ i))
+             (assoc-ref word-apos-alist (cons (1- i) i)))
             ; For having apos in the same word
             ((assoc-ref word-apos-alist (cons i i))
              (assoc-ref word-apos-alist (cons i i)))
+            ; For nonbreaking-prefix like Mr. Mrs. etc
+            ((and (< (1+ i) (length lemma-seq))
+                  (string=? "." (cog-name (list-ref lemma-seq (1+ i))))
+                  (is-nonbreaking-prefix? (cog-name (list-ref lemma-seq i))))
+             (set! i (1+ i))
+             (WordNode (string-append (cog-name (list-ref lemma-seq (1- i))) ".")))
+            ; For time, regardless of the format e.g. "2am", "2 am", "2 a.m." or "2a.m.",
+            ; will all get splitted into two words, and this is a quick workaround for
+            ; the problem of RelEx lemmatizing "a.m." to "be"
+            ((and (< (1+ i) (length lemma-seq))
+                  (string->number (cog-name (list-ref lemma-seq i)))
+                  (string=? "be" (cog-name (list-ref lemma-seq (1+ i)))))
+             (set! i (1+ i))
+             (list (list-ref lemma-seq (1- i)) (list-ref final-word-seq i)))
             ; Just a normal word
             (else (list-ref lemma-seq i)))))))
-    (Evaluation ghost-lemma-seq (List SENT (List final-lemma-seq))))))
+    (Evaluation
+      ghost-lemma-seq
+      (List SENT
+        (List (map
+          (lambda (w)
+            (WordNode (string-downcase (cog-name w))))
+          (flatten final-lemma-seq))))))))
 
 ; ----------
 (define (get-lemma-from-relex WORD)
@@ -194,7 +224,14 @@
 "
   (define seen-lemma (assoc-ref lemma-alist WORD))
   (if (equal? #f seen-lemma)
-      (let ((lemma (get-lemma-from-relex WORD)))
+      (let ((lemma
+              ; Don't bother if it's, say, a personal title like "Mrs."
+              ; or it's time related like a.m. and p.m.
+              (if (or (is-nonbreaking-prefix? WORD)
+                      (string=? "a.m." WORD)
+                      (string=? "p.m." WORD))
+                WORD
+                (get-lemma-from-relex WORD))))
         (set! lemma-alist (assoc-set! lemma-alist WORD lemma))
         lemma)
       seen-lemma))
@@ -274,6 +311,18 @@
         (list x)))
     LST))
 
+ ; ----------
+(define (flatten LST)
+"
+  Flatten a list of lists.
+"
+  (cond ((null? LST) '())
+        ((pair? (car LST))
+         (append (flatten (car LST))
+                 (flatten (cdr LST))))
+        (else (cons (car LST) (flatten (cdr LST)))))
+)
+
 ; ----------
 (define (get-rejoinder-level TYPE)
 "
@@ -321,7 +370,7 @@
 ; ----------
 (define (get-rule-from-label LABEL)
 "
-  Given the label of a rule in string, return the rule with that lavel.
+  Given the label of a rule in string, return the rule with that label.
 "
   (define rule (filter psi-rule?
     (cog-chase-link 'ListLink 'ImplicationLink
@@ -333,3 +382,30 @@
           "Failed to find the GHOST rule \"~a\"" LABEL)
         (list))
       (car rule)))
+
+; ----------
+(define (is-nonbreaking-prefix? WORD)
+"
+  Check if WORD is a personal title.
+"
+  (define lst (list
+    "abp" "adj" "adm" "adv" "asst" "bart" "bp" "bldg" "brig" "bros"
+    "capt" "cmdr" "col" "comdr" "con" "corp" "cpl" "dr" "drs" "ens"
+    "gen" "gov" "hon" "hr" "hosp" "insp" "lt" "maj" "messrs" "mlle"
+    "mm" "mme" "mr" "mrs" "ms" "msgr" "op" "ord" "pfc" "ph" "prof"
+    "pvt" "rep" "reps" "res" "rev" "rt" "sen" "sens" "sfc" "sgt"
+    "sr" "st" "supt" "surg"
+    "abstr" "acad" "acct" "accts" "admin" "agric" "amer" "ar" "arch"
+    "assn" "assoc" "cong" "dept" "econ" "ed" "ess" "evang" "fr"
+    "gaz" "glac" "gr" "hist" "hosp" "inst" "jas" "let" "lett" "libr"
+    "mss" "mt" "org" "phys" "princ" "proc" "prod" "prol" "prov" "pt"
+    "publ" "quot" "quots" "ref" "reg" "rep" "rept" "rev" "roy" "russ"
+    "soc" "tel" "ths" "trad" "transl" "univ" "will" "wk" "wkly" "wlky"
+    "wks" "wm" "yr" "zool" "v" "vs" "i.e" "e.g" "op" "cit" "p.s" "q.v"
+    "viz" "no" "nos" "art" "nr" "pp" "fig" "i" "ii" "p" "seq" "sp"
+    "spec" "specif" "vol" "vols"))
+
+  (or (member (string-downcase WORD) lst)
+      (and (string-suffix? "." WORD)
+           (member (string-downcase (car (string-split WORD #\.))) lst)))
+)