In [13]:
import os
import spacy
import pandas as pd
import networkx as nx
from collections import deque
# Load the spaCy 'en_core_web_sm' pipeline once; later cells call nlp(line) to parse text.
nlp = spacy.load('en_core_web_sm')

In [43]:
# Read the text of one sample record: skip the header row, then keep everything
# after the first tab of the following row, with trailing whitespace removed.
with open('../corpus/preprocessed/20001230.tsv') as fp:
    _header, record = fp.readline(), fp.readline()
    line = record.split('\t', 1)[1].rstrip()

# Map every tagged term (VALUE column) to its entity type (BIOENTITY column).
tag_table = pd.read_csv('../corpus/preprocessed_tagged/20001230.tsv',
                        sep='\t', usecols=[3, 4])
tags = tag_table.set_index('VALUE')['BIOENTITY'].to_dict()
terms = tags.keys()

# Parse the record text; the resulting doc is what the following cells inspect.
doc = nlp(line)

{'CHILDHOOD_ACUTE_MYELOID_LEUKEMIA': 'Disease',
 'AML': 'Disease',
 'FLT3': 'Gene',
 'NPM1': 'Gene',
 'AKT1': 'Gene',
 'RUNX1': 'Gene',
 'RUNX1T1': 'Gene',
 'JAK2': 'Gene',
 'CANCER': 'Disease'}

In [47]:
# Get the tags in each sentence by token position and create a graph of the
# tokens in the paragraph.
#
# Nodes are labelled '<text>-<token.i>', so a word that occurs several times
# gets one distinct node per occurrence. doc_terms holds, per sentence, the
# labels of the tokens whose text is a tagged term.
edges = []
doc_terms = []
for sent in doc.sents:
    sent_terms = []
    for token in sent:
        token_format = '{0}-{1}'.format(token.text, token.i)
        if token.text in terms:
            sent_terms.append(token_format)
        # One edge per dependency arc: this token -> each of its children.
        edges.extend((token_format, '{0}-{1}'.format(child.text, child.i))
                     for child in token.children)
    doc_terms.append(sent_terms)

# Build the undirected graph once, after every edge has been collected.
# (Constructing it inside the sentence loop re-created the whole graph for each
# sentence and left `graph` undefined when the doc had no sentences.)
graph = nx.Graph(edges)

In [65]:
doc_terms

[['CHILDHOOD_ACUTE_MYELOID_LEUKEMIA-9', 'AML-11', 'FLT3-35', 'NPM1-37'],
 ['NPM1-68'],
 [],
 [],
 ['AKT1-114',
  'RUNX1-116',
  'RUNX1T1-122',
  'JAK2-125',
  'AML-138',
  'CANCER-143',
  'AML-151'],
 [],
 [],
 []]

In [49]:
# Collect, sentence by sentence, every pair of tagged terms whose entity types
# differ. Pairs follow sentence order: the first element is the earlier term.
paths_to_get = []
for sent_terms in doc_terms:
    if len(sent_terms) >= 2:
        # Drop the '-<index>' suffix to recover the key used in `tags`.
        keys = [label.rsplit('-', 1)[0] for label in sent_terms]
        for first, (first_term, first_key) in enumerate(zip(sent_terms, keys)):
            first_kind = tags[first_key]
            later_terms = zip(sent_terms[first + 1:], keys[first + 1:])
            for second_term, second_key in later_terms:
                if tags[second_key] != first_kind:
                    paths_to_get.append((first_term, second_term))

In [54]:
# For each term pair print: first term, second term (both with token position),
# distance in the dependency graph, and distance in tokens.
for src, dst in paths_to_get:
    tree_dist = nx.shortest_path_length(graph, source=src, target=dst)
    # The token position is the number after the last '-' in the label.
    word_dist = int(dst.rsplit('-', 1)[1]) - int(src.rsplit('-', 1)[1])
    print(src, dst, tree_dist, word_dist)

CHILDHOOD_ACUTE_MYELOID_LEUKEMIA-9 FLT3-35 12 26
CHILDHOOD_ACUTE_MYELOID_LEUKEMIA-9 NPM1-37 12 28
AML-11 FLT3-35 13 24
AML-11 NPM1-37 13 26
AKT1-114 AML-138 5 24
AKT1-114 CANCER-143 5 29
AKT1-114 AML-151 6 37
RUNX1-116 AML-138 6 22
RUNX1-116 CANCER-143 6 27
RUNX1-116 AML-151 7 35
RUNX1T1-122 AML-138 9 16
RUNX1T1-122 CANCER-143 9 21
RUNX1T1-122 AML-151 10 29
JAK2-125 AML-138 10 13
JAK2-125 CANCER-143 10 18
JAK2-125 AML-151 11 26


In [64]:
# Print text, POS and position of the tokens up to two places to the left and
# right of every tagged term.
tag_pos = [int(term.rsplit('-', 1)[1])
           for sent_terms in doc_terms
           for term in sent_terms]
for pos in tag_pos:
    for offset in [-2, -1, 1, 2]:
        neighbor_pos = pos + offset
        # Skip positions outside the doc: a negative index would silently wrap
        # around to the end of the doc, and one past the end raises IndexError.
        if not 0 <= neighbor_pos < len(doc):
            continue
        word = doc[neighbor_pos]
        print(word.text, word.pos_, neighbor_pos)

diagnosed VERB 7
with ADP 8
( PUNCT 10
AML PROPN 11
CHILDHOOD_ACUTE_MYELOID_LEUKEMIA PROPN 9
( PUNCT 10
) PUNCT 12
using VERB 13
mutations NOUN 33
in ADP 34
and CCONJ 36
NPM1 PROPN 37
FLT3 PROPN 35
and CCONJ 36
genes NOUN 38
, PUNCT 39
, PUNCT 66
and CCONJ 67
mutations NOUN 69
in ADP 70
genes NOUN 112
( PUNCT 113
, PUNCT 115
RUNX1 PROPN 116
AKT1 PROPN 114
, PUNCT 115
, PUNCT 117
LTB PROPN 118
SDC1 PROPN 120
, PUNCT 121
, PUNCT 123
and CCONJ 124
, PUNCT 123
and CCONJ 124
) PUNCT 126
from ADP 127
involved VERB 136
in ADP 137
, PUNCT 139
whereas ADP 140
other ADJ 141
30 NUM 142
genes NOUN 144
, PUNCT 145
in ADP 149
an DET 150
context NOUN 152
, PUNCT 153


In [None]:
# Distance from root to gene

In [46]:
# One tab-separated row per token (text, pos_, tag_, dep_), with a gap printed
# after each sentence.
print('text\tPOS\tTAG\tdependency')
for sentence in doc.sents:
    for tok in sentence:
        print('\t'.join((tok.text, tok.pos_, tok.tag_, tok.dep_)))
    print('\n')

text	POS	TAG	dependency
We	PRON	PRP	nsubj
studied	VERB	VBD	ROOT
a	DET	DT	det
series	NOUN	NN	dobj
of	ADP	IN	prep
68	NUM	CD	nummod
subjects	NOUN	NNS	pobj
diagnosed	VERB	VBN	acl
with	ADP	IN	prep
CHILDHOOD_ACUTE_MYELOID_LEUKEMIA	PROPN	NNP	pobj
(	PUNCT	-LRB-	punct
AML	PROPN	NNP	appos
)	PUNCT	-RRB-	punct
using	VERB	VBG	advcl
conventional	ADJ	JJ	amod
cytogenetics	NOUN	NNS	dobj
and	CCONJ	CC	cc
fluorescence	NOUN	NN	conj
in	ADP	IN	prep
situ	NOUN	NN	amod
hybridization	NOUN	NN	pobj
(	PUNCT	-LRB-	punct
FISH	NOUN	NN	appos
)	PUNCT	-RRB-	punct
,	PUNCT	,	punct
polymerase	NOUN	NN	compound
chain	NOUN	NN	compound
reaction	NOUN	NN	nsubj
(	PUNCT	-LRB-	punct
PCR	PROPN	NNP	appos
)	PUNCT	-RRB-	punct
to	PART	TO	aux
analyze	VERB	VB	relcl
mutations	NOUN	NNS	dobj
in	ADP	IN	prep
FLT3	PROPN	NNP	pobj
and	CCONJ	CC	cc
NPM1	PROPN	NNP	conj
genes	NOUN	NNS	pobj
,	PUNCT	,	punct
and/or	CCONJ	CC	aux
array	VERB	VBP	conj
comparative	ADJ	JJ	amod
genomic	ADJ	JJ	amod
hybridization	NOUN	NN	dobj
(	PUNCT	-LRB-	punct
CGH	PROPN	NNP	app

<function spacy.glossary.explain(term)>

In [38]:
# Print the text of every token whose dependency label is 'ROOT'.
for root in (tok for tok in doc if tok.dep_ == 'ROOT'):
    print(root.text)

studied
observed
observed
were
reported
correspond
tested
correlated


In [57]:
# For each noun chunk print the chunk text, the chunk's root token, that root's
# dependency label, and the text of the root's head; sentences are separated by
# a gap.
print('Noun Chunk dependencies')
print('NounChunk\tChunkRoot\tChunkDep\tChunkHeadText')
for sentence in doc.sents:
    for chunk in sentence.noun_chunks:
        chunk_root = chunk.root
        print(chunk.text, '\t', chunk_root.text, '\t', chunk_root.dep_, '\t',
              chunk_root.head.text)
    print('\n')

Noun Chunk dependencies
NounChunk	ChunkRoot	ChunkDep	ChunkHeadText
We 	 We 	 nsubj 	 studied
a series 	 series 	 dobj 	 studied
68 subjects 	 subjects 	 pobj 	 of
CHILDHOOD_ACUTE_MYELOID_LEUKEMIA 	 CHILDHOOD_ACUTE_MYELOID_LEUKEMIA 	 pobj 	 with
AML 	 AML 	 appos 	 CHILDHOOD_ACUTE_MYELOID_LEUKEMIA
conventional cytogenetics 	 cytogenetics 	 dobj 	 using
fluorescence 	 fluorescence 	 conj 	 cytogenetics
situ hybridization 	 hybridization 	 pobj 	 in
FISH 	 FISH 	 appos 	 hybridization
polymerase chain reaction 	 reaction 	 nsubj 	 array
(PCR 	 PCR 	 appos 	 reaction
mutations 	 mutations 	 dobj 	 analyze
FLT3 	 FLT3 	 pobj 	 in
genes 	 genes 	 pobj 	 in
comparative genomic hybridization 	 hybridization 	 dobj 	 array
CGH 	 CGH 	 appos 	 hybridization


Cytogenetic/FISH abnormalities 	 abnormalities 	 nsubjpass 	 observed
71% 	 % 	 pobj 	 in
subjects 	 subjects 	 pobj 	 of
FLT3-ITD mutations 	 mutations 	 appos 	 %
15% 	 % 	 pobj 	 in
NPM1 	 NPM1 	 conj 	 mutations
mutations 	 mutations 	 

In [68]:
# For each token whose text is in `vocab`, print its left children, its right
# children and its ancestors in the dependency tree.
vocab = [
    'CHILDHOOD_ACUTE_MYELOID_LEUKEMIA', 'AML', 'FLT3', 'AKT1', 'NPM1',
    'RUNX1', 'RUNX1T1', 'JAK2', 'CANCER',
]
print('TERM\tLEFT_CHILDREN\tRIGHT_CHILDREN\tDEPENDS_ON')
for sentence in doc.sents:
    for tok in sentence:
        if tok.text not in vocab:
            continue
        left_children = [t.text for t in tok.lefts]
        right_children = [t.text for t in tok.rights]
        depends_on = [t.text for t in tok.ancestors]
        print(tok.text, '\t', left_children, '\t', right_children, '\t', depends_on)
    print('\n')

TERM	LEFT_CHILDREN	RIGHT_CHILDREN	DEPENDS_ON
CHILDHOOD_ACUTE_MYELOID_LEUKEMIA 	 [] 	 ['(', 'AML', ')'] 	 ['with', 'diagnosed', 'subjects', 'of', 'series', 'studied']
AML 	 [] 	 [] 	 ['CHILDHOOD_ACUTE_MYELOID_LEUKEMIA', 'with', 'diagnosed', 'subjects', 'of', 'series', 'studied']
FLT3 	 [] 	 [] 	 ['in', 'mutations', 'analyze', 'reaction', 'array', 'studied']
NPM1 	 [] 	 [] 	 ['in', 'mutations', 'analyze', 'reaction', 'array', 'studied']


NPM1 	 [] 	 ['mutations', 'in'] 	 ['mutations', '%', 'in', 'observed']






AKT1 	 [] 	 [',', 'RUNX1'] 	 ['genes', 'reported']
RUNX1 	 [] 	 [',', 'LTB'] 	 ['AKT1', 'genes', 'reported']
RUNX1T1 	 [] 	 [',', 'and', 'JAK2'] 	 ['SDC1', 'LTB', 'RUNX1', 'AKT1', 'genes', 'reported']
JAK2 	 [] 	 [')'] 	 ['RUNX1T1', 'SDC1', 'LTB', 'RUNX1', 'AKT1', 'genes', 'reported']
AML 	 [] 	 [] 	 ['in', 'involved', 'reported']
CANCER 	 [] 	 [] 	 ['genes', 'reported', 'reported']
AML 	 [] 	 [] 	 ['context', 'in', 'reported', 'reported']










In [75]:
# For each token whose text is in `vocab`, print the tokens up to two places to
# its left and right.
vocab = ['CHILDHOOD_ACUTE_MYELOID_LEUKEMIA', 'AML', 'FLT3', 'AKT1', 'NPM1', 'RUNX1', 'RUNX1T1',
         'JAK2', 'CANCER']
print('TERM\tNEIGHBORS')
for sent in doc.sents:
    for token in sent:
        if token.text in vocab:
            # token.nbor() raises IndexError when the requested neighbor lies
            # outside the doc, so only ask for offsets that stay in range.
            # A term near either end of the doc then lists fewer neighbors
            # instead of aborting the cell.
            neighbors = [token.nbor(offset) for offset in [-2, -1, 1, 2]
                         if 0 <= token.i + offset < len(doc)]
            print(token.text, neighbors)
    print('\n')

TERM	NEIGHBORS
CHILDHOOD_ACUTE_MYELOID_LEUKEMIA [diagnosed, with, (, AML]
AML [CHILDHOOD_ACUTE_MYELOID_LEUKEMIA, (, ), using]
FLT3 [mutations, in, and, NPM1]
NPM1 [FLT3, and, genes, ,]


NPM1 [,, and, mutations, in]






AKT1 [genes, (, ,, RUNX1]
RUNX1 [AKT1, ,, ,, LTB]
RUNX1T1 [SDC1, ,, ,, and]
JAK2 [,, and, ), from]
AML [involved, in, ,, whereas]
CANCER [other, 30, genes, ,]
AML [in, an, context, ,]










## Compute and write the features for every file in the corpus

In [23]:
# For every file in the preprocessed corpus:
#   1. parse the text of its first record with spaCy,
#   2. write a per-token POS / tag / dependency table, and
#   3. if a tagged-terms file with the same name can be read, write the
#      dependency neighborhood (left children, right children, ancestors)
#      of every tagged term.
# Files whose text or tagged terms cannot be read are skipped.
preprocessed_dir = '../corpus/preprocessed/'
tagged_dir = '../corpus/preprocessed_tagged/'
pos_out_dir = '../corpus/features/pos_tag_dependency/'
dependents_out_dir = '../corpus/features/dependent_words/'

# Make sure the output directories exist so the writes below cannot fail on a
# fresh checkout.
for out_dir in (pos_out_dir, dependents_out_dir):
    os.makedirs(out_dir, exist_ok=True)

files = os.listdir(preprocessed_dir)
processed = 0
for f in files:
    with open(os.path.join(preprocessed_dir, f)) as fp:
        try:
            fp.readline()  # skip the header row
            line = fp.readline().split('\t', 1)[1].rstrip()
        except (IndexError, UnicodeDecodeError):
            # If the abstract can't be processed (no second tab-separated
            # field, or undecodable bytes) do nothing for this file.
            continue
    doc = nlp(line)

    with open(os.path.join(pos_out_dir, f), 'w') as fp:
        fp.write('text\tPOS\tTAG\tdependency\n')
        for sent in doc.sents:
            for token in sent:
                fp.write(token.text + '\t' + token.pos_ + '\t' + token.tag_
                         + '\t' + token.dep_ + '\n')
            fp.write('\n')

    try:
        # A set gives constant-time membership tests in the token loop below.
        terms = set(pd.read_csv(os.path.join(tagged_dir, f),
                                sep='\t', usecols=[3])['VALUE'])
    except (OSError, ValueError, KeyError):
        # No usable tagged-terms file: it is missing or unreadable (OSError),
        # empty or malformed (pandas parser errors derive from ValueError), or
        # has no 'VALUE' column (KeyError). Skip the dependent-words output.
        continue

    with open(os.path.join(dependents_out_dir, f), 'w') as fp:
        fp.write('TERM\tLEFT_CHILDREN\tRIGHT_CHILDREN\tDEPENDS_ON\n')
        for sent in doc.sents:
            for token in sent:
                if token.text in terms:
                    fp.write(token.text + '\t' + str([t.text for t in token.lefts]) +
                             '\t' + str([t.text for t in token.rights]) + '\t' +
                             str([t.text for t in token.ancestors]) + '\n')
            fp.write('\n')

    # Progress report: only files that produced both outputs are counted.
    processed += 1
    if processed % 100 == 0:
        print(processed)

{'a', 'b'}