In [1]:
import csv
import itertools
import re
import string
import spacy
import pandas as pd
import networkx as nx
from collections import Counter
nlp = spacy.load('en_core_web_sm')

In [2]:
# spaCy applies its own rules for splitting text into sentences. The input here
# is already split, so that feature is switched off.
def prevent_sentence_boundary_detection(doc):
    """Flag every token of `doc` as not starting a sentence.

    Args:
        doc: an iterable of tokens whose `is_sent_start` attribute can be assigned.

    Returns:
        The same `doc` object that was passed in.
    """
    for tok in doc:
        # Setting the flag on every token leaves no sentence start to detect.
        setattr(tok, 'is_sent_start', False)
    return doc

# Register the hook under the name 'prevent-sbd', placed before the 'parser'
# component (presumably so the parser sees the is_sent_start flags - TODO confirm).
nlp.add_pipe(prevent_sentence_boundary_detection, name='prevent-sbd', before='parser')

In [3]:
def join_bad_splits(parsed):
    """Re-join tokens that the default tokenizer split apart.

    Two patterns are repaired, always by merging in place through the doc's
    retokenizer:

    * a token that is exactly one capital letter is merged with the token
      before it;
    * a lone '-' token is merged with the token before it and the token after it.

    A matching token at index 0 is left alone: there is no preceding token, and
    a negative slice start would not select the intended neighbours.

    Args:
        parsed: the parsed doc; it is modified in place.

    Returns:
        The same doc object, after all merges have been applied.
    """
    merged = True
    while merged:
        merged = False
        for token in parsed:
            i = token.i
            if i == 0:
                continue
            if re.fullmatch(r'[A-Z]', token.text) is not None:
                span = parsed[i-1:i+1]
            elif token.text == '-':
                span = parsed[i-1:i+2]
            else:
                continue
            with parsed.retokenize() as retokenizer:
                retokenizer.merge(span)
            # Merging removes tokens, so the indexes of the remaining iteration
            # are stale: restart the scan from the beginning of the doc.
            merged = True
            break
    return parsed

In [4]:
def get_shortest_path(graph, pairs):
    """Get the shortest dependency-tree path length for each pair of nodes.

    Args:
        graph: the networkx graph to search.
        pairs: iterable of (source, target) node pairs.

    Returns:
        A list with one path length per pair that could be connected, in the
        order of `pairs`. Pairs with a missing node or no connecting path are
        skipped. If nothing could be computed the list is [-1].
    """
    path_lens = []
    for p in pairs:
        try:
            path_lens.append(nx.shortest_path_length(graph, p[0], p[1]))
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Only "not reachable" / "not in graph" are expected here; anything
            # else is a real error and should surface.
            continue
    if len(path_lens) == 0:
        return [-1]
    return path_lens

In [5]:
def find_ner_term(ner, token):
    """Check whether the entity term `ner` is mentioned by the token text.

    If `ner` itself contains one of the separator characters (. , + * / -) it
    is looked up as a substring of `token`. Otherwise `ner` must equal the
    token, or equal one of the pieces obtained by splitting the token on those
    separators.

    Args:
        ner: the entity string to look for.
        token: the text of a single token.

    Returns:
        True if the token mentions the entity, False otherwise.
    """
    separators = r'[\.\,\+\*/-]'
    # The split must be applied to the entity (not the token) to learn whether
    # the entity contains punctuation.
    ner_split = re.split(separators, ner)
    if len(ner_split) != 1:
        return ner in token
    subtokens = re.split(separators, token)
    return ner == token or ner in subtokens

In [6]:
def tree_distance(gene, disease, parsed):
    """Dependency-tree distance and word distance between a gene and a disease.

    Every token becomes a graph node labelled '<text>-<index>', and every
    head/child relation becomes an undirected edge. Tokens are matched against
    `gene` and `disease` with find_ner_term.

    Args:
        gene: the gene term to look for.
        disease: the disease term to look for.
        parsed: the parsed sentence.

    Returns:
        A two-element list [tree_distance, word_distance]:
        * tree_distance is the first value returned by get_shortest_path for
          the gene/disease pairs (-1 when no path could be computed);
        * word_distance is the absolute difference of the token indexes of the
          first gene/disease pair.
        When the sentence contains no gene/disease pair, [-1, -1] is returned.
    """
    edges = []
    gene_mentions = []
    disease_mentions = []
    for token in parsed:
        token_format = '{0}-{1}'.format(token.text, token.i)
        if find_ner_term(gene, token.text):
            gene_mentions.append((token_format, token.i))
        if find_ner_term(disease, token.text):
            disease_mentions.append((token_format, token.i))
        for child in token.children:
            edges.append((token_format, '{0}-{1}'.format(child.text, child.i)))
    graph = nx.Graph(edges)
    pairs = [(g, d) for g in gene_mentions for d in disease_mentions]
    if not pairs:
        # Nothing to measure: use the -1 sentinel for both values instead of
        # handing None back to the caller.
        return [-1, -1]
    min_dists = get_shortest_path(graph, [(g[0], d[0]) for g, d in pairs])
    word_dists = [abs(g[1] - d[1]) for g, d in pairs]
    # Currently only 1 pair per sentence given tags
    return [min_dists[0], word_dists[0]]

In [7]:
def common_ancestor(gene, disease, doc):
    """Find the deepest ancestor shared by each gene/disease mention pair.

    For every token matching `gene` or `disease` (via find_ner_term) the chain
    of ancestors is collected and reversed. The two chains of each pair are
    then compared position by position until they differ.

    Returns:
        One tuple per pair: (text of the last shared ancestor or '',
        depth counter starting at -1 and incremented per shared ancestor,
        gene chain length minus depth, disease chain length minus depth).
    """
    def _reversed_chain(tok):
        # Reversed so that the two chains can be compared from their first element.
        return [(anc.text, anc.i) for anc in tok.ancestors][::-1]

    gene_chains = []
    disease_chains = []
    for tok in doc:
        if find_ner_term(gene, tok.text):
            gene_chains.append(_reversed_chain(tok))
        if find_ner_term(disease, tok.text):
            disease_chains.append(_reversed_chain(tok))

    results = []
    for g_chain in gene_chains:
        for d_chain in disease_chains:
            shared_word = ''
            shared_depth = -1
            for g_anc, d_anc in zip(g_chain, d_chain):
                if g_anc != d_anc:
                    # First mismatch: the two chains diverge here.
                    break
                shared_word = d_anc[0]
                shared_depth += 1
            results.append((shared_word,
                            shared_depth,
                            len(g_chain) - shared_depth,
                            len(d_chain) - shared_depth))
    return results

In [8]:
def pos_dist(doc, counts):
    """Give the POS-tag distribution of `doc`, normalized by its token count.

    Args:
        doc: a sized iterable of tokens exposing `pos_`.
        counts: mapping whose keys are the POS tags to report; its key order
            fixes the order of the returned values.

    Returns:
        A list with one fraction per key of `counts`. A tag that is not a key
        of `counts` is counted under 'X' when 'X' is a key; otherwise it is
        ignored so that the output keeps one value per key. An empty doc
        yields all zeros.
    """
    counter = {k: 0 for k in counts.keys()}
    n_tokens = len(doc)
    if n_tokens == 0:
        return [0.0] * len(counter)
    for token in doc:
        if token.pos_ in counter:
            counter[token.pos_] += 1
        elif 'X' in counter:
            # The X POS tag is "other"; used for tags not present in the main counts.
            counter['X'] += 1
    # Normalize by the number of tokens.
    return [x / n_tokens for x in counter.values()]

In [9]:
def chunk_root_normalized(doc, counts):
    """Count noun-chunk root lemmas, normalized by the number of noun chunks.

    Args:
        doc: object exposing `noun_chunks`; each chunk exposes `root.lemma_`.
        counts: a Counter of root lemmas; its 100 most common keys, in
            `most_common` order, define the reported values.

    Returns:
        A list with one value per selected lemma: occurrences of that lemma as
        a chunk root divided by the number of chunks in `doc`. All zeros when
        the doc has no noun chunks.
    """
    counter = {lemma: 0 for lemma, _ in counts.most_common(100)}
    n_chunks = 0
    for chunk in doc.noun_chunks:
        n_chunks += 1
        lemma = chunk.root.lemma_
        if lemma in counter:
            counter[lemma] += 1
    if n_chunks == 0:
        # No chunks to normalize by: avoid dividing by zero.
        return [0.0] * len(counter)
    # Normalize counts
    return [x / n_chunks for x in counter.values()]

In [10]:
def chunk_head_normalized(doc, counts):
    """Count lemmas of noun-chunk heads, normalized by the number of noun chunks.

    Args:
        doc: object exposing `noun_chunks`; each chunk exposes `root.head.lemma_`.
        counts: a Counter of head lemmas; its 100 most common keys, in
            `most_common` order, define the reported values.

    Returns:
        A list with one value per selected lemma: occurrences of that lemma as
        the head of a chunk root divided by the number of chunks in `doc`.
        All zeros when the doc has no noun chunks.
    """
    counter = {lemma: 0 for lemma, _ in counts.most_common(100)}
    n_chunks = 0
    for chunk in doc.noun_chunks:
        n_chunks += 1
        lemma = chunk.root.head.lemma_
        if lemma in counter:
            counter[lemma] += 1
    if n_chunks == 0:
        # No chunks to normalize by: avoid dividing by zero.
        return [0.0] * len(counter)
    # Normalize counts
    return [x / n_chunks for x in counter.values()]

In [11]:
def negation_presence(doc):
    """Tell whether at least one token of `doc` carries the 'neg' dependency label."""
    for tok in doc:
        if tok.dep_ == 'neg':
            return True
    return False

In [12]:
def ancestor_negation(gene, disease, doc):
    """Report whether an ancestor shared by the gene and the disease is negated.

    The ancestor indexes of every token matching `gene` / `disease` (via
    find_ner_term) are collected and reversed; for each gene/disease pair the
    leading run of identical indexes is kept as the shared ancestors.

    Returns:
        1 if any shared ancestor is the head of a token whose dependency label
        is 'neg', otherwise 0.
    """
    def _reversed_indexes(tok):
        # Reversed so that the two chains can be compared from their first element.
        return [anc.i for anc in tok.ancestors][::-1]

    gene_paths = []
    disease_paths = []
    for tok in doc:
        if find_ner_term(gene, tok.text):
            gene_paths.append(_reversed_indexes(tok))
        if find_ner_term(disease, tok.text):
            disease_paths.append(_reversed_indexes(tok))

    shared = []
    # Should only be 1 pair in this set
    for g_path in gene_paths:
        for d_path in disease_paths:
            for g_idx, d_idx in zip(g_path, d_path):
                if g_idx != d_idx:
                    # First mismatch: the two chains diverge here.
                    break
                shared.append(d_idx)

    negated_heads = [tok.head.i for tok in doc if tok.dep_ == 'neg']
    for idx in shared:
        if idx in negated_heads:
            return 1
    return 0

In [13]:
def negated_ancestor_noun(doc):
    """Locate what each negation word applies to, skipping over nouns.

    For every token with dependency label 'neg', climb from its head through
    consecutive heads whose POS is 'NOUN'. The climb stops at the first head
    that is not a noun, or at the token labelled 'ROOT'.

    Returns:
        A list with one token index per negation, in document order.
    """
    found = []
    for neg in (tok for tok in doc if tok.dep_ == 'neg'):
        node = neg.head
        # Keep climbing only while we are on a non-root noun.
        while node.dep_ != 'ROOT' and node.pos_ == 'NOUN':
            node = node.head
        found.append(node.i)
    return found

def step_tree(term, doc):
    """Collect the indexes of the heads above every token whose text equals `term`.

    Starting at each matching token, the head chain is followed until a token
    labelled 'ROOT' is reached; the index of every head visited is recorded.

    NOTE(review): matching is exact text equality, whereas the other feature
    functions locate terms with find_ner_term - confirm this is intended.

    Returns:
        A flat list of token indexes, for all matching tokens in document order.
    """
    ancestor_ids = []
    for tok in doc:
        if tok.text != term:
            continue
        node = tok
        while node.dep_ != 'ROOT':
            node = node.head
            ancestor_ids.append(node.i)
    return ancestor_ids

def noun_chunk_negated(gene, disease, doc):
    """Report whether a negation target lies on the path above the gene or disease.

    Returns:
        1 if any index returned by step_tree for `gene` or `disease` is also
        returned by negated_ancestor_noun for `doc`, otherwise 0.
    """
    on_path = step_tree(gene, doc) + step_tree(disease, doc)
    negated = negated_ancestor_noun(doc)
    for idx in on_path:
        if idx in negated:
            return 1
    return 0

In [14]:
# Render the dependency parse of one example sentence to a standalone HTML page.
doc_draw = join_bad_splits(nlp("the results do not suggest a contribution of A2M and lrp to the development of MS."))
# jupyter=False forces displacy to return the markup as a string. Inside a
# notebook it would otherwise display the figure and return None, which cannot
# be written to the file below.
html = spacy.displacy.render(doc_draw, style='dep', page=True, jupyter=False,
                             options={'distance': 100, 'compact': True})
# NOTE(review): absolute, machine-specific output path - consider making it configurable.
with open('/home/jsilva/winFiles/Downloads/output.html', 'w', encoding='utf-8') as fp:
    fp.write(html)

In [16]:
data = pd.read_csv('../dataset/final_train.csv', header=None, skiprows=[0],
                  names=['id', 'assoc', 'gene', 'disease', 'sentence'])
docs = []
for index, entry in data.iterrows():
    docs.append(join_bad_splits(nlp(entry.sentence)))
    
pos_counts = Counter()
for doc in docs:
    for token in doc:
        pos_counts[token.pos_] += 1

chunk_roots = Counter()
for doc in docs:
    for chunk in doc.noun_chunks:
        chunk_roots[chunk.root.lemma_] += 1

chunk_heads = Counter()
for doc in docs:
    for chunk in doc.noun_chunks:
        chunk_heads[chunk.root.head.lemma_] += 1
# This makes the feature data frame
features = pd.DataFrame()
tree_dists = []
word_dists = []
for e, d in zip(data.iterrows(), docs):
    result = tree_distance(e[1].gene, e[1].disease, d)
    tree_dists.append(result[0])
    word_dists.append(result[1])
features['tree_dists'] = tree_dists
features['word_dists'] = word_dists
common_words = []
common_depth = []
gene_fork_len = []
disease_fork_len = []
for d, e in zip(docs, data.iterrows()):
    result = common_ancestor(e[1].gene, e[1].disease, d)[0]
    common_words.append(result[0])
    common_depth.append(result[1])
    gene_fork_len.append(result[2])
    disease_fork_len.append(result[3])
features['common_word'] = common_words
features['common_depth'] = common_depth
features['gene_fork_len'] = gene_fork_len
features['disease_fork_len'] = disease_fork_len
any_negation = []
for d in docs:
    any_negation.append(int(negation_presence(d)))
features['any_negation'] = any_negation
shared_negation = []
for d, e in zip(docs, data.iterrows()):
    shared_negation.append(ancestor_negation(e[1].gene, e[1].disease, d))
features['shared_negation'] = shared_negation   
noun_negated = []
for d, e in zip(docs, data.iterrows()):
    noun_negated.append(noun_chunk_negated(e[1].gene, e[1].disease, d))
features['noun_negated'] = noun_negated
pos_cols = list(pos_counts.keys())
pos_dist_results = []
for doc in docs:
        pos_dist_results.append(pos_dist(doc, pos_counts))
pos_df = pd.DataFrame(pos_dist_results, columns=pos_cols)
features = pd.merge(features, pos_df, left_index=True, right_index=True)
chunk_root_cols = list(chunk_roots.keys())[:100]
chunk_root_results = []
for doc in docs:
     chunk_root_results.append(chunk_root_normalized(doc, chunk_roots))
root_df = pd.DataFrame(chunk_root_results, columns=chunk_root_cols)
features = pd.merge(features, root_df, left_index=True, right_index=True)
chunk_head_cols = list(chunk_heads.keys())[:100]
chunk_head_results = []
for doc in docs:
    chunk_head_results.append(chunk_head_normalized(doc, chunk_heads))
head_df = pd.DataFrame(chunk_head_results, columns=chunk_head_cols)
features = pd.merge(features, head_df, left_index=True, right_index=True, suffixes=('_root', '_head'))
features.to_csv("../corpus/features/left_features_train.csv", index=False)

In [18]:
# Could make use of a function here
data = pd.read_csv('../dataset/final_test.csv', header=None, skiprows=[0],
                  names=['id', 'assoc', 'gene', 'disease', 'sentence'])
docs = []
for index, entry in data.iterrows():
    docs.append(join_bad_splits(nlp(entry.sentence)))
# We are reusing the top 100 counts from train for chunk root/pos/head for consistency of features
# This makes the feature data frame
features = pd.DataFrame()
tree_dists = []
word_dists = []
for e, d in zip(data.iterrows(), docs):
    result = tree_distance(e[1].gene, e[1].disease, d)
    tree_dists.append(result[0])
    word_dists.append(result[1])
features['tree_dists'] = tree_dists
features['word_dists'] = word_dists
common_words = []
common_depth = []
gene_fork_len = []
disease_fork_len = []
for d, e in zip(docs, data.iterrows()):
    result = common_ancestor(e[1].gene, e[1].disease, d)[0]
    common_words.append(result[0])
    common_depth.append(result[1])
    gene_fork_len.append(result[2])
    disease_fork_len.append(result[3])
features['common_word'] = common_words
features['common_depth'] = common_depth
features['gene_fork_len'] = gene_fork_len
features['disease_fork_len'] = disease_fork_len
any_negation = []
for d in docs:
    any_negation.append(int(negation_presence(d)))
features['any_negation'] = any_negation
shared_negation = []
for d, e in zip(docs, data.iterrows()):
    shared_negation.append(ancestor_negation(e[1].gene, e[1].disease, d))
features['shared_negation'] = shared_negation   
noun_negated = []
for d, e in zip(docs, data.iterrows()):
    noun_negated.append(noun_chunk_negated(e[1].gene, e[1].disease, d))
features['noun_negated'] = noun_negated
pos_cols = list(pos_counts.keys())
pos_dist_results = []
for doc in docs:
        pos_dist_results.append(pos_dist(doc, pos_counts))
pos_df = pd.DataFrame(pos_dist_results, columns=pos_cols)
features = pd.merge(features, pos_df, left_index=True, right_index=True)
chunk_root_cols = list(chunk_roots.keys())[:100]
chunk_root_results = []
for doc in docs:
     chunk_root_results.append(chunk_root_normalized(doc, chunk_roots))
root_df = pd.DataFrame(chunk_root_results, columns=chunk_root_cols)
features = pd.merge(features, root_df, left_index=True, right_index=True)
chunk_head_cols = list(chunk_heads.keys())[:100]
chunk_head_results = []
for doc in docs:
        chunk_head_results.append(chunk_head_normalized(doc, chunk_heads))
head_df = pd.DataFrame(chunk_head_results, columns=chunk_head_cols)
features = pd.merge(features, head_df, left_index=True, right_index=True, suffixes=('_root', '_head'))
features.to_csv("../corpus/features/left_features_test.csv", index=False)