In [17]:
import io
import re
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_lg
import numpy as np
# Load the pipeline registered under the 'en' name; every later cell parses text through `nlp`.
# NOTE(review): `en_core_web_lg` is imported above but never used in the visible cells —
# confirm that 'en' resolves to the model you intend to use.
nlp = spacy.load('en')

In [5]:
# Characters that clean_token strips out of a token (unless the token is exactly one of them).
REMOVED_CHAR = ["/", "%", "*"]

# Whole-token replacements applied by clean_token before any character stripping:
# escaped sentence punctuation and bracket placeholders map to their literal characters.
NORMALIZE_DICT = {
    "/.": ".",
    "/?": "?",
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}

In [6]:
def clean_token(token):
    '''
    Normalize a single token string.

    The token is first replaced by its NORMALIZE_DICT entry when it has one.
    If the result is exactly one of the REMOVED_CHAR entries it is returned
    untouched; otherwise every REMOVED_CHAR character is deleted from it.
    A token that ends up empty is returned as ",".
    '''
    result = NORMALIZE_DICT.get(token, token)
    # A token that *is* a removable character is kept as-is
    if result in REMOVED_CHAR:
        return result
    for unwanted in REMOVED_CHAR:
        result = result.replace(unwanted, u'')
    # Never hand back an empty token
    return result if result else ","

In [11]:
# Lowercased tokens that _extract_from_sent skips when its `blacklist` argument is true.
NO_COREF_LIST = ["i", "me", "my", "you", "your"]

# Mention type name -> integer id, and the inverse lookup (derived so the two cannot drift apart).
MENTION_TYPE = {"PRONOMINAL": 0, "NOMINAL": 1, "PROPER": 2, "LIST": 3}
MENTION_LABEL = {type_id: name for name, type_id in MENTION_TYPE.items()}

PROPERS_TAGS = ["NN", "NNS", "NNP", "NNPS"]
# Named-entity labels kept as mentions by extract_mentions_spans.
# "FACILITY" is the spaCy 1.x spelling; spaCy 2.x models emit "FAC" for the same class,
# so both are accepted to avoid silently dropping facility entities.
ACCEPTED_ENTS = ["PERSON", "NORP", "FACILITY", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"]
WHITESPACE_PATTERN = r"\s+|_+"
UNKNOWN_WORD = "*UNK*"
MISSING_WORD = "<missing>"
# Upper bound on the number of head-chain steps _extract_from_sent takes for a "'s" token.
MAX_ITER = 100
# Module-level switch for the trace prints in extract_mentions_spans (falsy = silent).
debug = 0

In [12]:
def extract_mentions_spans(doc):
    '''
    Extract potential mentions from a spacy parsed Doc

    Candidates are gathered in this order: named entities whose label is in
    ACCEPTED_ENTS, then the spans returned by _extract_from_sent for each
    sentence of the doc. Empty spans and spans whose (start, end) offsets were
    already stored are dropped, so the first candidate for a location wins.

    The module-level `debug` flag is forwarded to _extract_from_sent so that
    turning it on traces the whole extraction, not only this function.

    Returns:
        (cleaned_mentions_spans, spans_set)
        cleaned_mentions_spans: list of the kept spans, in discovery order.
        spans_set: dict mapping each kept span to its (start, end) token offsets.
    '''
    if debug:
        print('===== doc ====:', doc)
        for c in doc:
            print("🚧 span search:", c, "head:", c.head, "tag:", c.tag_, "pos:", c.pos_, "dep:", c.dep_)
    # Named entities
    mentions_spans = list(ent for ent in doc.ents if ent.label_ in ACCEPTED_ENTS)

    if debug: print("==-- ents:", list(((ent, ent.label_) for ent in mentions_spans)))
    for sent in doc.sents:
        # Only the spans are needed here; the parallel offsets list is recomputed below
        spans, _ = _extract_from_sent(doc, sent, True, debug)
        mentions_spans.extend(spans)

    spans_set = {}
    seen_offsets = set()  # O(1) duplicate check instead of scanning spans_set.values()
    cleaned_mentions_spans = []
    for span in mentions_spans:
        offsets = (span.start, span.end)
        if span.end > span.start and offsets not in seen_offsets:
            seen_offsets.add(offsets)
            cleaned_mentions_spans.append(span)
            spans_set[span] = offsets

    return cleaned_mentions_spans, spans_set

In [13]:
def _extract_from_sent(doc, span, blacklist=True, debug=False):
    '''
    Extract Pronouns and Noun phrases mentions from a spacy Span

    Args:
        doc: the parsed Doc that `span` belongs to; it is indexed with absolute token positions.
        span: the Span whose tokens are scanned (one sentence when called from extract_mentions_spans).
        blacklist: when true, tokens whose lowercase form is in NO_COREF_LIST are skipped.
        debug: when true, print a trace of every decision. This parameter shadows the
            module-level `debug` flag inside this function.

    Returns:
        (mentions_spans, mention_spans_loc): two parallel lists holding the stored spans
        and their (start, end) token offsets in `doc`, with `end` exclusive.
        Duplicates are not removed here.

    Raises:
        AssertionError: if the walk up the head chain from a "'s" token ends with the
            step counter equal to MAX_ITER.
    '''
    # Fine-grained tags considered for mentions: tags starting with N, PRP, DT or IN
    keep_tags = re.compile(r"N.*|PRP.*|DT|IN")
    # Dependencies that disqualify a token, unless its dependency is in keep_dep
    leave_dep = ["det", "compound", "appos"]
    # Dependencies that qualify a token whatever its tag
    keep_dep = ["nsubj", "dobj", "iobj", "pobj"]
    nsubj_or_dep = ["nsubj", "dep"]
    conj_or_prep = ["conj", "prep"]
    # Coarse POS values and lowercase forms that a mention may neither start nor end with
    remove_pos = ["CCONJ", "INTJ", "ADP"]
    lower_not_end = ["'s", ',', '.', '!', '?', ':', ';']

    # Utility to remove bad endings
    # `left` / `right` are lists of token indices; the candidate span runs from the
    # smallest of `left` to the largest of `right`, always including `token` itself.
    # Returns (start, end) with `end` exclusive; start == end means nothing is left.
    def cleanup_endings(left, right, token):
        minchild_idx = min(left + [token.i]) if left else token.i
        maxchild_idx = max(right + [token.i]) if right else token.i
        # Clean up endings and beginning
        while maxchild_idx >= minchild_idx and (doc[maxchild_idx].pos_ in remove_pos
                                           or doc[maxchild_idx].lower_ in lower_not_end):
            if debug: print("Removing last token", doc[maxchild_idx].lower_, doc[maxchild_idx].tag_)
            maxchild_idx -= 1 # We don't want mentions finishing with 's or conjunctions/punctuation
        while minchild_idx <= maxchild_idx and (doc[minchild_idx].pos_ in remove_pos 
                                           or doc[minchild_idx].lower_ in lower_not_end):
            if debug: print("Removing first token", doc[minchild_idx].lower_, doc[minchild_idx].tag_)
            minchild_idx += 1 # We don't want mentions starting with 's or conjunctions/punctuation
        return minchild_idx, maxchild_idx+1

    mentions_spans = []
    mention_spans_loc = []
    # NOTE: the name `span` is rebound inside this loop; the iteration itself is not
    # affected because the for statement obtained its iterator before the first rebinding.
    for token in span:
        if debug: print("🚀 tok:", token, "tok.tag_:", token.tag_, "tok.pos_:", token.pos_, "tok.dep_:", token.dep_)

        if blacklist and token.lower_ in NO_COREF_LIST:
            if debug: print("token in no_coref_list")
            continue
        # Skip unless the dependency is in keep_dep, or the tag matches keep_tags
        # and the dependency is not in leave_dep
        if (not keep_tags.match(token.tag_) or token.dep_ in leave_dep) and not token.dep_ in keep_dep:
            if debug: print("not pronoun or no right dependency")
            continue

        # pronoun
        if re.match(r"PRP.*", token.tag_):
            if debug: print("PRP")
            endIdx = token.i + 1

            # The pronoun alone is always stored as a one-token span
            span = doc[token.i: endIdx]
            if debug: print("==-- PRP store:", span)
            mentions_spans.append(span)
            mention_spans_loc.append((token.i, endIdx))

            # when pronoun is a part of conjunction (e.g., you and I)
            # Also store the span running from the pronoun's left edge to its right edge
            if token.n_rights > 0 or token.n_lefts > 0:
                span = doc[token.left_edge.i : token.right_edge.i+1]
                if debug: print("==-- in conj store:", span)
                mentions_spans.append(span)
                mention_spans_loc.append((token.left_edge.i, token.right_edge.i+1))
            continue

        # Add NP mention
        if debug:
            print("NP or IN:", token.lower_)
            if token.tag_ == 'IN':
                print("IN tag")
        # Take care of 's
        if token.lower_ == "'s":
            if debug: print("'s detected")
            h = token.head
            j = 0
            # Walk up the head chain until a token that is its own head is reached,
            # or MAX_ITER steps were taken
            while h.head.i != h.i and j < MAX_ITER:
                if debug:
                    print("token head:", h, h.dep_, "head:", h.head)
                    print(id(h.head), id(h))
                if h.dep_ == "nsubj":
                    # Span covering every token attached to h's head with an nsubj or dep
                    # dependency, from the leftmost left edge to the rightmost right edge
                    minchild_idx = min((c.left_edge.i for c in doc if c.head.i == h.head.i and c.dep_ in nsubj_or_dep),
                                       default=token.i)
                    maxchild_idx = max((c.right_edge.i for c in doc if c.head.i == h.head.i and c.dep_ in nsubj_or_dep),
                                       default=token.i)
                    if debug: print("'s', i1:", doc[minchild_idx], " i2:", doc[maxchild_idx])
                    span = doc[minchild_idx : maxchild_idx+1]
                    if debug: print("==-- 's' store:", span)
                    mentions_spans.append(span)
                    mention_spans_loc.append((minchild_idx, maxchild_idx+1))
                    break
                h = h.head
                j += 1
            # Fails when the loop above stopped with the step counter at MAX_ITER
            assert j != MAX_ITER
            continue

        # clean up
        # This loop only prints (under debug) the tokens whose head is `token`
        for c in doc:
            if debug and c.head.i == token.i: print("🚧 token in span:", c, "- head & dep:", c.head, c.dep_)
        # Left / right edge indices of every token in the doc whose head is `token`
        left = list(c.left_edge.i for c in doc if c.head.i == token.i)
        right = list(c.right_edge.i for c in doc if c.head.i == token.i)
        # An IN token with a "mark" dependency and no dependents borrows the edges
        # of the tokens attached to its own head
        if token.tag_ == 'IN' and token.dep_ == "mark" and len(left) == 0 and len(right) == 0:
            left = list(c.left_edge.i for c in doc if c.head.i == token.head.i)
            right = list(c.right_edge.i for c in doc if c.head.i == token.head.i)
        if debug:
            print("left side:", left)
            print("right side:", right)
            # These two indices are only used for the debug print below
            minchild_idx = min(left) if left else token.i
            maxchild_idx = max(right) if right else token.i
            print("full span:", doc[minchild_idx:maxchild_idx+1])
        start, end = cleanup_endings(left, right, token)
        if start == end:
            continue
        if doc[start].lower_ == "'s":
            continue # we probably already have stored this mention
        span = doc[start:end]
        if debug:
            print("cleaned endings span:", doc[start:end])
            print("==-- full span store:", span)
        mentions_spans.append(span)
        mention_spans_loc.append((start, end))
        if debug and token.tag_ == 'IN':
            print("IN tag")
        # When the stored span contains a conj or prep token, additionally store a
        # second span built only from the dependents of `token` that are not conj/prep
        if any(tok.dep_ in conj_or_prep for tok in span):
            if debug: print("Conjunction found, storing first element separately")
            for c in doc:
                if c.head.i == token.i and c.dep_ not in conj_or_prep:
                    if debug: print("left no conj:", c, 'dep & edge:', c.dep_, c.left_edge)
                    if debug: print("right no conj:", c, 'dep & edge:', c.dep_, c.right_edge)
            left_no_conj = list(c.left_edge.i for c in doc if c.head.i == token.i and c.dep_ not in conj_or_prep)
            right_no_conj = list(c.right_edge.i for c in doc if c.head.i == token.i and c.dep_ not in conj_or_prep)
            if debug: print("left side no conj:", [doc[i] for i in left_no_conj])
            if debug: print("right side no conj:", [doc[i] for i in right_no_conj])
            start, end = cleanup_endings(left_no_conj, right_no_conj, token)
            if start == end:
                continue
            span = doc[start:end]
            if debug: print("==-- full span store:", span)
            mentions_spans.append(span)
            mention_spans_loc.append((start, end))
    if debug: print("mentions_spans inside", mentions_spans)
    return mentions_spans, mention_spans_loc

In [14]:
# Quick check on a short sentence: the cell output is the list of mention spans
# followed by the span -> (start, end) mapping.
doc = nlp('He is Manoj, he went to college')
extract_mentions_spans(doc)

([Manoj, He, he, college],
 {Manoj: (2, 3), He: (0, 1), he: (4, 5), college: (7, 8)})

In [23]:
# Load the saved test passages; the path is relative to the notebook's working directory.
test_passages = np.load("processed_data/test_passage_list.npy")

In [24]:
# Number of loaded passages. The cell previously referenced `test_data`, a name that is
# not defined anywhere in this notebook, so it failed on a fresh kernel.
len(test_passages)

133

In [42]:
# Run mention extraction on every test passage and keep, per passage, the
# span -> (start, end) mapping; the list of spans itself (`spans`) is not used.
test_mentions_list = []
for passage in test_passages:
    doc = nlp(str(passage))
    spans, loc_dict = extract_mentions_spans(doc)
    test_mentions_list.append(loc_dict)

In [43]:
# Number of whitespace-separated tokens in the first passage
len(test_passages[0].split())

301

In [44]:
# Mentions found in the first passage, as a span -> (start, end) mapping
test_mentions_list[0]

{Yang Yang: (59, 61),
 the Beijing Municipal Construction Commission: (189, 194),
 the Communication Commission: (197, 200),
 Traffic: (0, 1),
 the first thing to be impacted when this , cave - in accident occurred: (2,
  16),
 this: (9, 10),
 cave: (11, 12),
 cave - in accident: (11, 15),
 this accident: (19, 21),
 many underground pipes , such as sewage pipes , ah , or water , ah , and gas , ah , pipes: (23,
  45),
 many underground pipes: (23, 26),
 such as sewage pipes , ah , or water , ah , and gas , ah , pipes: (27, 45),
 sewage pipes , ah , or water , ah , and gas , ah , pipes: (29, 45),
 sewage pipes: (29, 31),
 water , ah , and gas: (35, 41),
 water: (35, 36),
 gas: (40, 41),
 water , ah , and gas , ah , pipes: (35, 45),
 that: (46, 47),
 the lives of citizens: (48, 52),
 the lives: (48, 50),
 citizens: (51, 52),
 many other aspects: (53, 56),
 how many pipes: (65, 68),
 seven main types of pipes buried underground: (78, 85),
 seven main types: (78, 81),
 pipes buried undergro

In [51]:
# Number of spaCy tokens in the first passage (the length of a Doc is its token count)
len(nlp(str(test_passages[0])))

301

In [40]:
# NOTE(review): `tokens` is not defined in any visible cell of this notebook, and this cell's
# execution count is out of order — it looks like a leftover that will raise NameError
# on Restart & Run All. Confirm and remove, or define `tokens` explicitly.
tokens

<generator object <genexpr> at 0x7f8adb347f48>