In [1]:
import csv
import itertools
import re
import spacy
import pandas as pd
import networkx as nx
from collections import Counter
nlp = spacy.load('en_core_web_sm')

In [2]:
# Spacy has some default rules for spliting text into sentences, as our text is already split
# disabled this feature
def prevent_sentence_boundary_detection(doc):
    for token in doc:
        # This will entirely disable spaCy's sentence detection
        token.is_sent_start = False
    return doc

nlp.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', before='parser')

In [3]:
def join_bad_splits(parsed):
    """ Default tokenizer splits over some or all '-'s do this as adding rules wasn't working"""
    for token in parsed:
        if re.fullmatch(r'[A-Z]', token.text) is not None:
            i = token.i
            if i == 0:
                continue
            with parsed.retokenize() as retokenizer:
                retokenizer.merge(parsed[i-1:i+1])
            return join_bad_splits(parsed)
        if token.text == '-':
            i = token.i
            with parsed.retokenize() as retokenizer:
                retokenizer.merge(parsed[i-1:i+2])
            # Merging removes a token, so iterating over the list goes out of index    
            return join_bad_splits(parsed)
    return parsed

In [4]:
def find_ner_term(ner, token):
    """ Check if the ner term matches the token, if there is punctuation in the ner,
        check if it is a substring of the token"""
    subtokens = re.split(r'[\.\,\+\*/-]', token)
    ner_split = re.split(r'[\.\,\+\*/-]', token)
    if len(ner_split) != 1:
        return ner in token
    return ner == token or ner in subtokens

In [18]:
def ancestor_negation(gene, disease, doc):
    """ Returns a list of booleans for whether each ancestor is negated in order from
        root -> most common in parse tree. Returns an int"""
    gene_ancestors = []
    dis_ancestors = []
    # Get ancestors for each gene token
    for token in doc:
        if find_ner_term(gene, token.text):
            # Need to reverse list an select the first before they are different
            gene_ancestors.append([a.i for a in token.ancestors][::-1])
        if find_ner_term(disease, token.text):
            dis_ancestors.append([a.i for a in token.ancestors][::-1])
    pairs = [(g,d) for g in gene_ancestors for d in dis_ancestors]
    common_ancestors = []
    for p in pairs:
        common = []
        for gene_ancestor, disease_ancestor in zip(p[0], p[1]):
            if gene_ancestor == disease_ancestor:
                common.append(disease_ancestor)   
            # if they are different the trees diverge
            else:
                break
        common_ancestors += common
    common_ancestors = set(common_ancestors) # In case there are multiple pairs, shouldn't be anymore
    negations = []
    for token in doc:
        if token.dep_ == 'neg':
            negations.append(token.head.i)
    return [(c in negations) for c in common_ancestors]

In [7]:
def process_ner(x):
    return x.upper().replace(' ', '_')

data = pd.read_csv('../dataset/GAD_Y_N_wPubmedID_annotated_cap.csv', usecols=[2, 6, 9, 11], skiprows = [0],
                   header=None, names=['rel', 'gene', 'disease', 'sentence'])

data.gene = data.gene.apply(process_ner)
data.disease = data.disease.apply(process_ner)
data.rel = data.rel.apply(lambda x: x == 'Y')

In [8]:
docs = []
for index, entry in data.iterrows():
    docs.append(join_bad_splits(nlp(entry.sentence)))

In [20]:
# Assuming 'neg'  present, detect 378 with most common word, detect 465 other wise
i = 0
j = 0
for d, e in zip(docs, data.iterrows()):
    ancestors = ancestor_negation(e[1].gene, e[1].disease, d)
    shared = step_tree(e[1].gene, d) + step_tree(e[1].disease, d)
    non_noun_neg = negated_ancestor_noun(d)
    noun_approach = [s in non_noun_neg for s in shared]
    if any([token.dep_ == 'neg' for token in d]):
        i += 1
        if any(ancestors) != any(noun_approach):
            #print(any(ancestors), any(noun_approach), e[1].sentence, '\n')
            j += 1
print(i, j)

604 63


In [12]:
def negated_root(doc):
    """ Returns whether the root node is negated in the document."""
    flag = False
    for token in doc:    
        if token.dep_ == 'neg' and token.head.dep_ == 'ROOT':
            flag = not flag # Need to apply each itme the root is negated
    return flag

In [10]:
def negation_presence(doc):
    """ Returns whether there are any negation dependencies in the document."""
    return any([token.dep_ == 'neg' for token in doc])

In [11]:
def negated_noun_chunk(term, doc):
    """ Returns wehther the noun chunk is negated. """
    for chunk in doc.noun_chunks:
        print(chunk)

In [121]:
test = nlp('based on these data, ABCB6 is not the causative gene for GRACILE_SYNDROME.')

In [16]:
def negated_ancestor_noun(doc):
    """ Gives the indexes for the first ancestor of a negation word which is not a noun,
        for all negations in a document"""
    indexes = []
    for token in doc:
        if token.dep_ == 'neg':
            head = token.head
            while head.dep_ != 'ROOT':
                if head.pos_ == 'NOUN':
                    head = head.head
                else:
                    indexes.append(head.i)
                    break
            if head.dep_ == 'ROOT':
                indexes.append(head.i)
    return indexes

In [127]:
negated_ancestor_noun(test)

[6]

In [14]:
def step_tree(term, doc):
    """ Gives the token indexes for the ancestors of a given term"""
    indexes = []
    for token in doc:
        if token.text == term:
            head = token
            while head.dep_ != 'ROOT':
                head = head.head
                indexes.append(head.i)
    return indexes

In [129]:
negated_ancestor_noun(test) in step_tree('ACE', test)

False

In [119]:
step_tree('ACE', test)

[14, 4, 18, 2]