Skip to content

Commit

Permalink
Merge pull request #33 from leungmanhin/ghost-dot
Browse files Browse the repository at this point in the history
Fix issues with dots and numbers in Ghost
  • Loading branch information
leungmanhin authored Oct 25, 2018
2 parents 989a24f + 66ac466 commit 5bd74e1
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 34 deletions.
17 changes: 14 additions & 3 deletions opencog/ghost/cs-parse.scm
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,20 @@
((has-match? "!" str) (result:suffix 'NOT location #f))
((has-match? "[?]" str) (result:suffix '? location "?"))
((has-match? "=" str) (result:suffix 'EQUAL location #f))
; For time -- a.m. and p.m.
; Just catch it to avoid it being splitted into multiple words
; Should be done before any literal / lemma matching
((has-match? "[ap]\\.m\\." str)
(result:suffix 'STRING location
(string-trim-both (match:substring current-match))))
; Words with apostrophe, e.g. I'm, it's etc
((has-match? "[a-zA-Z]+['’][a-zA-Z]+" str)
(result:suffix 'LITERAL_APOS location
(string-trim-both (match:substring current-match))))
; Literals for example: Mr. Dr. etc
((has-match? "[a-zA-Z]+\\." str)
(result:suffix 'LITERAL location
(string-trim-both (match:substring current-match))))
; Literals -- words start with a '
((has-match? "'[a-zA-Z]+\\b" str)
(result:suffix 'LITERAL location
Expand All @@ -158,15 +168,15 @@
; Literals, words in the pattern that are not in their canonical forms
(result:suffix 'LITERAL location
(string-trim-both (match:substring current-match)))))
((has-match? "[0-9]+[0-9.]*" str)
((has-match? "[0-9]+[0-9.]*\\b" str)
(result:suffix 'NUM location
(string-trim-both (match:substring current-match))))
((has-match? "[|]" str)
(result:suffix 'VLINE location
(string-trim-both (match:substring current-match))))
(string-trim-both (match:substring current-match))))
((has-match? "," str)
(result:suffix 'COMMA location
(string-trim-both (match:substring current-match))))
(string-trim-both (match:substring current-match))))
; This should always be near the end, because it is broadest of all.
((has-match? "[~’'._!?0-9a-zA-Z-]+" str)
(result:suffix 'STRING location
Expand Down Expand Up @@ -507,6 +517,7 @@

(literal
(LITERAL) : (format #f "(cons 'word \"~a\")" $1)
(NUM) : (format #f "(cons 'word \"~a\")" $1)
(STRING) : (format #f "(cons 'word \"~a\")" $1)
)

Expand Down
56 changes: 33 additions & 23 deletions opencog/ghost/terms.scm
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,29 @@
"
Occurrence of a word, a word that should be matched literally.
"
(let* ((str-dc (if (string=? STR "I") STR (string-downcase STR)))
(v1 (WordNode str-dc))
(v2 (Variable (gen-var str-dc #f)))
(l (WordNode (get-lemma str-dc)))
(v (list (TypedVariable v2 (Type "WordInstanceNode"))))
(c (list (WordInstanceLink v2 (Variable "$P"))
(ReferenceLink v2 v1))))
(list v c (list v1) (list l))))
(let* ((str-dc (string-downcase STR))
; Special handling for time that's written as a single word
; e.g. 2pm, 10am etc
; For the input, regardless of the format, e.g. "2am", "2 am",
; "2 a.m." or "2a.m.", will all get splitted into two words
(is-time? (string-match "[0-9]{1,2}[ ]*[ap][.]*m[.]*" str-dc))
(is-two-digits?
(and is-time? (char-numeric? (string-ref str-dc 1))))
(time-1pt
(if is-two-digits?
(substring str-dc 0 2)
(substring str-dc 0 1)))
(time-2pt
(if is-two-digits?
(substring str-dc 2)
(substring str-dc 1))))
(list (list) (list)
(if is-time?
(list (WordNode time-1pt) (WordNode time-2pt))
(list (WordNode str-dc)))
(if is-time?
(list (WordNode time-1pt) (WordNode time-2pt))
(list (WordNode (get-lemma str-dc)))))))

; ----------
(define (word-apos STR)
Expand All @@ -23,33 +38,28 @@
"
(let* (; This turns ’ into ' just to treat them as the same thing
(nstr (regexp-substitute/global #f "" STR 'pre "'" 'post))
(l (WordNode
(if (string-prefix? "I'" nstr) nstr (string-downcase nstr)))))
(list (list) (list) (list l) (list l))))
(w (WordNode (string-downcase nstr))))
(list (list) (list) (list w) (list w))))

; ----------
(define (lemma STR)
"
Lemma occurrence, aka canonical form of a term.
This is the default for word mentions in the rule pattern.
"
(let* ((str-dc (if (string=? STR "I") STR (string-downcase STR)))
(v1 (Variable (gen-var str-dc #t)))
(v2 (Variable (gen-var str-dc #f)))
(let* ((str-dc (string-downcase STR))
(var (Variable (gen-var str-dc #t)))
(l (WordNode (get-lemma str-dc)))
(v (list (TypedVariable v1 (Type "WordNode"))
(TypedVariable v2 (Type "WordInstanceNode"))))
(c (list (ReferenceLink v2 v1)
; In some rare situation, particularly if the input
(v (list (TypedVariable var (Type "WordNode"))))
(c (list ; In some rare situation, particularly if the input
; sentence is not grammatical, RelEx may not lemmatize a
; word because of the ambiguity
; So just to be sure "l" is the stem of "v1",
; So just to be sure "l" is the stem of "var",
; a GroundedPredicateNode is used instead of putting
; "(LemmaLink v2 l)" in the context
(Evaluation (GroundedPredicate "scm: ghost-lemma?")
(List v1 l))
(WordInstanceLink v2 (Variable "$P")))))
(list v c (list v1) (list l))))
(List var l)))))
(list v c (list var) (list l))))

(define-public (ghost-lemma? GRD LEMMA)
"
Expand Down Expand Up @@ -557,7 +567,7 @@
((string=? "equal" OPERATOR)
(if both-numbers?
(= lv-num rv-num)
(string=? lv-str rv-str)))
(string-ci=? lv-str rv-str)))
((string=? "smaller" OPERATOR)
(and both-numbers?
(< lv-num rv-num)))
Expand Down
92 changes: 84 additions & 8 deletions opencog/ghost/utils.scm
Original file line number Diff line number Diff line change
Expand Up @@ -122,15 +122,26 @@
(set! final-word-seq (append final-word-seq (list merged-word)))
(set! word-apos-alist (assoc-set! word-apos-alist (cons i (1+ i)) merged-word))
(set! i (1+ i))))
; To merge for example "dr" and "." into one word, just for matching
((and (string=? next-word-str ".")
(is-nonbreaking-prefix? current-word-str))
(set! final-word-seq (append final-word-seq
(list (WordNode (string-append current-word-str ".")))))
(set! i (1+ i)))
; The current word may also have an apostrophe, make sure to turn
; "’" into "'" as well for consistency
((> (length current-word-splitted) 1)
(let ((new-word (WordNode (string-join current-word-splitted "'"))))
(set! final-word-seq (append final-word-seq (list new-word)))
(set! word-apos-alist (assoc-set! word-apos-alist (cons i i) new-word))))
(else (set! final-word-seq (append final-word-seq (list current-word-node)))))))

(Evaluation ghost-word-seq (List SENT (List final-word-seq)))
(Evaluation
ghost-word-seq
(List SENT
(List (map
(lambda (w)
(WordNode (string-downcase (cog-name w))))
final-word-seq))))

; Generate this lemma-seq only for backward compatibility
(if (not ghost-with-ecan)
Expand Down Expand Up @@ -161,15 +172,34 @@
(cond
; For the next-word-prefix-with-apos? case
((assoc-ref word-apos-alist (cons i (1+ i)))
(begin
(set! i (1+ i))
(assoc-ref word-apos-alist (cons (1- i) i))))
(set! i (1+ i))
(assoc-ref word-apos-alist (cons (1- i) i)))
; For having apos in the same word
((assoc-ref word-apos-alist (cons i i))
(assoc-ref word-apos-alist (cons i i)))
; For nonbreaking-prefix like Mr. Mrs. etc
((and (< (1+ i) (length lemma-seq))
(string=? "." (cog-name (list-ref lemma-seq (1+ i))))
(is-nonbreaking-prefix? (cog-name (list-ref lemma-seq i))))
(set! i (1+ i))
(WordNode (string-append (cog-name (list-ref lemma-seq (1- i))) ".")))
; For time, regardless of the format e.g. "2am", "2 am", "2 a.m." or "2a.m.",
; will all get splitted into two words, and this is a quick workaround for
; the problem of RelEx lemmatizing "a.m." to "be"
((and (< (1+ i) (length lemma-seq))
(string->number (cog-name (list-ref lemma-seq i)))
(string=? "be" (cog-name (list-ref lemma-seq (1+ i)))))
(set! i (1+ i))
(list (list-ref lemma-seq (1- i)) (list-ref final-word-seq i)))
; Just a normal word
(else (list-ref lemma-seq i)))))))
(Evaluation ghost-lemma-seq (List SENT (List final-lemma-seq))))))
(Evaluation
ghost-lemma-seq
(List SENT
(List (map
(lambda (w)
(WordNode (string-downcase (cog-name w))))
(flatten final-lemma-seq))))))))

; ----------
(define (get-lemma-from-relex WORD)
Expand All @@ -194,7 +224,14 @@
"
(define seen-lemma (assoc-ref lemma-alist WORD))
(if (equal? #f seen-lemma)
(let ((lemma (get-lemma-from-relex WORD)))
(let ((lemma
; Don't bother if it's, say, a personal title like "Mrs."
; or it's time related like a.m. and p.m.
(if (or (is-nonbreaking-prefix? WORD)
(string=? "a.m." WORD)
(string=? "p.m." WORD))
WORD
(get-lemma-from-relex WORD))))
(set! lemma-alist (assoc-set! lemma-alist WORD lemma))
lemma)
seen-lemma))
Expand Down Expand Up @@ -274,6 +311,18 @@
(list x)))
LST))

; ----------
(define (flatten LST)
"
Flatten a list of lists.
"
(cond ((null? LST) '())
((pair? (car LST))
(append (flatten (car LST))
(flatten (cdr LST))))
(else (cons (car LST) (flatten (cdr LST)))))
)

; ----------
(define (get-rejoinder-level TYPE)
"
Expand Down Expand Up @@ -321,7 +370,7 @@
; ----------
(define (get-rule-from-label LABEL)
"
Given the label of a rule in string, return the rule with that lavel.
Given the label of a rule in string, return the rule with that label.
"
(define rule (filter psi-rule?
(cog-chase-link 'ListLink 'ImplicationLink
Expand All @@ -333,3 +382,30 @@
"Failed to find the GHOST rule \"~a\"" LABEL)
(list))
(car rule)))

; ----------
(define (is-nonbreaking-prefix? WORD)
"
Check if WORD is a personal title.
"
(define lst (list
"abp" "adj" "adm" "adv" "asst" "bart" "bp" "bldg" "brig" "bros"
"capt" "cmdr" "col" "comdr" "con" "corp" "cpl" "dr" "drs" "ens"
"gen" "gov" "hon" "hr" "hosp" "insp" "lt" "maj" "messrs" "mlle"
"mm" "mme" "mr" "mrs" "ms" "msgr" "op" "ord" "pfc" "ph" "prof"
"pvt" "rep" "reps" "res" "rev" "rt" "sen" "sens" "sfc" "sgt"
"sr" "st" "supt" "surg"
"abstr" "acad" "acct" "accts" "admin" "agric" "amer" "ar" "arch"
"assn" "assoc" "cong" "dept" "econ" "ed" "ess" "evang" "fr"
"gaz" "glac" "gr" "hist" "hosp" "inst" "jas" "let" "lett" "libr"
"mss" "mt" "org" "phys" "princ" "proc" "prod" "prol" "prov" "pt"
"publ" "quot" "quots" "ref" "reg" "rep" "rept" "rev" "roy" "russ"
"soc" "tel" "ths" "trad" "transl" "univ" "will" "wk" "wkly" "wlky"
"wks" "wm" "yr" "zool" "v" "vs" "i.e" "e.g" "op" "cit" "p.s" "q.v"
"viz" "no" "nos" "art" "nr" "pp" "fig" "i" "ii" "p" "seq" "sp"
"spec" "specif" "vol" "vols"))

(or (member (string-downcase WORD) lst)
(and (string-suffix? "." WORD)
(member (string-downcase (car (string-split WORD #\.))) lst)))
)

0 comments on commit 5bd74e1

Please sign in to comment.