In [7]:
from concept_extractor import get_nlp_and_matcher
nlp, matcher = get_nlp_and_matcher()

In [189]:
import configparser
import json
import spacy
from spacy.matcher import Matcher
import sys
import timeit
from tqdm import tqdm
import numpy as np

# config = configparser.ConfigParser()
# config.read("paths.cfg")
# with open(config["paths"]["concept_vocab"], "r", encoding="utf8") as f:
#     cpnet_vocab = [l.strip() for l in list(f.readlines())]
# cpnet_vocab = [c.replace("_", " ") for c in cpnet_vocab]


blacklist = set(["-PRON-", "actually", "likely", "possibly", "want",
                 "make", "my", "someone", "sometimes_people", "sometimes","would", "want_to",
                 "one", "something", "sometimes", "everybody", "somebody", "could", "could_be"
                 ])

def lemmatize(nlp, concept):

    doc = nlp(concept.replace("_"," "))
    lcs = set()
    lcs.add("_".join([token.lemma_ for token in doc])) # all lemma
    return lcs

def load_concept_vocab():
    vocab = []
    with open(config["paths"]["concept_vocab"], "r", encoding="utf8") as f:
        vocab = [l.strip() for l in list(f.readlines())]
    concept2id = {}
    id2concept = {}
    for indice, cp in enumerate(vocab):
        concept2id[cp] = indice
        id2concept[indice] = cp
    return concept2id, id2concept

concept2id, id2concept = load_concept_vocab()


def lemmatize(nlp, concept):

    doc = nlp(concept.replace("_"," "))
    lcs = set()
    lcs.add("_".join([token.lemma_ for token in doc])) # all lemma
    return lcs

def load_matcher(nlp):
    config = configparser.ConfigParser()
    config.read("paths.cfg")
    with open(config["paths"]["matcher_patterns"], "r", encoding="utf8") as f:
        all_patterns = json.load(f)

    matcher = Matcher(nlp.vocab)
    for concept, pattern in tqdm(all_patterns.items(), desc="Adding patterns to Matcher."):
        matcher.add(concept, None, pattern)
    return matcher

def ground_mentioned_concepts(nlp, matcher, s, ans = ""):
    global concept2id
    s = s.lower()
    doc = nlp(s)
    matches = matcher(doc)

    mentioned_concepts = {}
    span_to_concepts = {}
    for match_id, start, end in matches:
        span = doc[start:end].text  # the matched span
        if len(set(span.split(" ")).intersection(set(ans.split(" ")))) > 0:
            continue
        original_concept = nlp.vocab.strings[match_id]
        # print("Matched '" + span + "' to the rule '" + string_id)

        if len(original_concept.split("_")) == 1:
            original_concept = list(lemmatize(nlp, original_concept))[0]

        if span not in span_to_concepts:
            span_to_concepts[span] = set()

        span_to_concepts[span].add(original_concept)

    for span, concepts in span_to_concepts.items():
        concepts_sorted = list(concepts)
        concepts_sorted.sort(key=len)

        # mentioned_concepts.update(concepts_sorted[0:2])

        shortest = concepts_sorted[0:3] #
        for c in shortest:
            if c in blacklist:
                continue
            lcs = lemmatize(nlp, c)
            intersect = lcs.intersection(shortest)
            if len(intersect)>0:
                c = list(intersect)[0]
                if c in concept2id:
                    mentioned_concepts[span] = c
                    break
            else:
                if c in concept2id:
                    mentioned_concepts[span] = c
                    break

    
    mentioned_concepts_with_indices = []
    for match_id, start, end in matches:
        span = doc[start:end].text
        if span in mentioned_concepts:
            concept = mentioned_concepts[span]
            concept_id = concept2id[concept]
            mentioned_concepts_with_indices.append([start, end, span, concept, concept_id])

    mentioned_concepts_with_indices = sorted(mentioned_concepts_with_indices, key=lambda x: (x[1],-x[0])) # sort based on end then start
    
    # mentioned_concepts_with_indice with filtered intersection
    res = []
    for mc in reversed(mentioned_concepts_with_indices):
        if len(res) == 0:
            res.append(mc)
        elif mc[1] <= res[-1][0]: # no intersection between current concept, and last included concepts 
            res.append(mc)
    
    res.reverse()
    
    return res

def hard_ground(nlp, sent):
    global cpnet_vocab
    sent = sent.lower()
    doc = nlp(sent)
    res = []
    for idx, t in enumerate(doc):
        if t.lemma_ in cpnet_vocab and t.lemma_ in concept2id:
            concept_id = concept2id[t.lemma_]
            res.append([idx, idx + 1, str(t), str(t.lemma_), concept_id])
    return res

def match_mentioned_concepts(nlp, matcher, sent):
    # print("Begin matching concepts.")
    all_concepts = ground_mentioned_concepts(nlp, matcher, sent)
    if len(all_concepts)==0:
        all_concepts = hard_ground(nlp, sent) # not very possible
        print('hard ground', sent)

    return all_concepts



In [192]:
def hard_ground(nlp, sent):
    global cpnet_vocab
    sent = sent.lower()
    doc = nlp(sent)
    res = []
    for idx, t in enumerate(doc):
        if t.lemma_ in cpnet_vocab and t.lemma_ in concept2id:
            concept_id = concept2id[t.lemma_]
            res.append([idx, idx + 1, t, str(t.lemma_), concept_id])
    return res

st = 'Telemundo is owned by ESPN.'
hard_ground(nlp, st)

[[1, 2, is, 'be', 1452], [2, 3, owned, 'own', 395], [3, 4, by, 'by', 2749]]

In [232]:
s = "Debonding Abaddon debonding abaddon remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens ."
print(nlp(s))
concepts = match_mentioned_concepts(nlp, matcher, sent=s)
print(concepts)

Debonding Abaddon debonding abaddon remained the team 's starting quarterback for the rest of the season and went on to lead the 49ers to their first Super Bowl appearance since 1994 , losing to the Baltimore Ravens .
[[1, 2, 'abaddon', 'abaddon', 79949], [3, 4, 'abaddon', 'abaddon', 79949], [4, 5, 'remained', 'remain', 3085], [6, 7, 'team', 'team', 1736], [8, 9, 'starting', 'start', 13762], [9, 10, 'quarterback', 'quarterback', 48514], [12, 13, 'rest', 'rest', 309], [15, 16, 'season', 'season', 6760], [17, 19, 'went on', 'go_on', 9784], [20, 21, 'lead', 'lead', 7235], [24, 25, 'their', 'mine', 19735], [25, 26, 'first', 'first', 3945], [26, 28, 'super bowl', 'super_bowl', 199537], [28, 29, 'appearance', 'appearance', 1263], [29, 30, 'since', 'since', 14293], [32, 33, 'losing', 'lose', 4705], [35, 36, 'baltimore', 'baltimore', 18538]]


In [233]:
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('../../bert_base/', do_lower_case=False)

In [234]:
tokens = tokenizer.tokenize(s)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

print(tokens, len(tokens))
print(input_ids, len(input_ids))

['De', '##bon', '##ding', 'A', '##bad', '##don', 'de', '##bon', '##ding', 'a', '##bad', '##don', 'remained', 'the', 'team', "'", 's', 'starting', 'quarterback', 'for', 'the', 'rest', 'of', 'the', 'season', 'and', 'went', 'on', 'to', 'lead', 'the', '49ers', 'to', 'their', 'first', 'Super', 'Bowl', 'appearance', 'since', '1994', ',', 'losing', 'to', 'the', 'Baltimore', 'Ravens', '.'] 47
[3177, 8868, 3408, 138, 8330, 3842, 1260, 8868, 3408, 170, 8330, 3842, 1915, 1103, 1264, 112, 188, 2547, 9119, 1111, 1103, 1832, 1104, 1103, 1265, 1105, 1355, 1113, 1106, 1730, 1103, 19531, 1106, 1147, 1148, 3198, 5308, 2468, 1290, 1898, 117, 3196, 1106, 1103, 5553, 21848, 119] 47


In [183]:
def bert_concept_alignment(tok2id, words, concepts):
    word_id2concept_id = [0] * len(words)
    for i in range(len(words)):
        word_id2concept_id[i] = i
    
    concept_id = 0
    s_idx = 0
    for word_id, word in enumerate(words):
        if concept_id == len(concepts):
            word_id2concept_id[word_id] = -1
            continue
        word_concept = concepts[concept_id][2]
        word_id2concept_id[word_id] = concept_id
        cur_span = ' '.join(words[s_idx:(word_id + 1)]).lower()
        
#         print('concept_idx={}, concept={}, word_idx={}, word={}, cur_span={}, head_i={}'.format(concept_id, word_concept, word_id, word, cur_span, word_id2concept_id[word_id]))
        if cur_span.lower() == word_concept.lower():
            concept_id += 1
            s_idx = word_id + 1
        elif not word_concept.lower().startswith((cur_span+' ')): #current word does not belong to any concept
            s_idx = word_id + 1
            word_id2concept_id[word_id] = -1

    for word_id, word in enumerate(words):
        concept_id = word_id2concept_id[word_id]
        concept = concepts[concept_id] if concept_id != -1 else 'NONE'
        print('word = {}, concept = {}'.format(word, concept))
    pass

def combine_berttoken_concept(tokens, concepts):
    
    tok2id = [0] * len(tokens)
    cur_indice = -1
    
    words = []
    cur_word = ''
    for idx, token in enumerate(tokens):
        if not token.startswith('##'):
            if cur_indice != -1:
                words.append(cur_word)
            cur_indice += 1
            cur_word = token
        else:
            cur_word += token[2:]
        tok2id[idx] = cur_indice
    words.append(cur_word)
    
    print(tokens, len(tokens))
    print(tok2id, len(tok2id))
    print(words, len(words))
    print(concepts, len(concepts))
    
    tok2id = bert_concept_alignment(tok2id, words, concepts)
    
combine_berttoken_concept(tokens, concepts)

['A', '##bad', '##don', 'remained', 'the', 'team', "'", 's', 'starting', 'quarterback', 'for', 'the', 'rest', 'of', 'the', 'season', 'and', 'went', 'on', 'to', 'lead', 'the', '49ers', 'to', 'their', 'first', 'Super', 'Bowl', 'appearance', 'since', '1994', ',', 'losing', 'to', 'the', 'Baltimore', 'Ravens', '.'] 38
[0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35] 38
['Abaddon', 'remained', 'the', 'team', "'", 's', 'starting', 'quarterback', 'for', 'the', 'rest', 'of', 'the', 'season', 'and', 'went', 'on', 'to', 'lead', 'the', '49ers', 'to', 'their', 'first', 'Super', 'Bowl', 'appearance', 'since', '1994', ',', 'losing', 'to', 'the', 'Baltimore', 'Ravens', '.'] 36
[[0, 1, 'abaddon', 'abaddon', 79949], [1, 2, 'remained', 'remain', 3085], [3, 4, 'team', 'team', 1736], [5, 6, 'starting', 'start', 13762], [6, 7, 'quarterback', 'quarterback', 48514], [9, 10, 'rest', 'rest', 309], [12, 13, 'season', 'sea

In [254]:
def do_need_append_token(span, concept):
    span = span.lower()
    word_concept = concept[2].lower() # get the ori word, not the concept
    
    if span == word_concept: # exact match, no need to append more token
        return False
    if not word_concept.startswith(span+' '): # current span is completly different with the concept. no need to append
        return False
    return True

def match_span_concept(span, concept):
    span = span.lower()
    word_concept = concept[2].lower() # get the ori word, not the concept
    
    if span == word_concept:
        return True
    return False

def bert_concept_alignment(tokens, concepts):
    n_token = len(tokens)
    n_concept = len(concepts)
    
    token_id2concept_id = np.zeros((n_token), dtype=int)
    tok2id = np.zeros((n_token), dtype=int)
    for i in range(n_token):
        token_id2concept_id[i] = i
        tok2id[i] = i
        
    s_id = 0
    e_id = 0
    concept_id = 0
    current_span = ''
    while s_id != n_token:
        # process sub-word level
        next_token_id = s_id + 1
        current_span = tokens[s_id]
        while next_token_id < n_token and tokens[next_token_id].startswith('##'):
            current_span += tokens[next_token_id][2:] # remove ##
            next_token_id += 1
        
        # let's see if combining next token will form a concept
        e_id = next_token_id
        while concept_id < n_concept and do_need_append_token(current_span, concepts[concept_id]):
            current_span += (' ' + tokens[e_id])
            e_id += 1
        
        # if current span match with current concept
        if concept_id < n_concept and match_span_concept(current_span, concepts[concept_id]):
            token_id2concept_id[s_id:e_id] = concept_id
            concept_id += 1
        else:
            token_id2concept_id[s_id:e_id] = -1
            
        s_id = e_id
    
    assert concept_id == n_concept
    
    cur_id = -1
    for idx in range(n_token):
        if token_id2concept_id[idx] == -1:
            cur_id += 1
        elif idx == 0 or token_id2concept_id[idx] != token_id2concept_id[idx - 1]:
            cur_id += 1
        
        tok2id[idx] = cur_id
    
    # merge same token into span
    final_tok2id = list(range(0, tok2id[-1] + 1))
    final_token_id2concept_id = []
    for idx in range(n_token):
        if idx == 0 or tok2id[idx] == -1 or tok2id[idx] != tok2id[idx - 1]:
            final_token_id2concept_id.append(token_id2concept_id[idx])
    
    assert len(final_tok2id) == len(final_token_id2concept_id)
    
    # create token pooling mask to pool subword level to span level. we use average pooling
    token_pooling_mask = []
    s_id = 0
    e_id = 0
    while s_id != n_token:
        e_id = s_id + 1
        
        while e_id < n_token and tok2id[s_id] != -1 and tok2id[s_id] == tok2id[e_id]:
            e_id += 1

        mask = np.zeros((n_token), dtype=float)
        n = e_id - s_id
        mask[s_id:e_id] = (1/n)
        token_pooling_mask.append(mask)
        s_id = e_id
    
    assert len(token_pooling_mask) == len(final_tok2id)
    return final_tok2id, final_token_id2concept_id, token_pooling_mask

def combine_berttoken_concept(tokens, concepts):
    tok2id = bert_concept_alignment(tokens, concepts)
    
combine_berttoken_concept(tokens, concepts)