# Engineering Features for Medical NLP

Various options, suited for medical NLP

Data: 18 abstracts

# Load

In [1]:
import spacy
import pandas as pd

from collections import Counter

In [2]:
def load_data(p):
    with open(p) as f:
        x = f.read()
    x = x.split("\n\n")
    x = [i.replace("\n", " ") for i in x]
    return(x)

p = "../../data/abstracts_manual.txt"
abstracts_raw = load_data(p)

len(abstracts_raw)

18

# Models

### Select model

In [3]:
# bionlp13cg model has nice special entities
model = 'en_ner_bionlp13cg_md'
nlp = spacy.load(model)

### Process data with selected model

In [4]:
abstracts = [nlp(x) for x in abstracts_raw]

# Medical Entities
Special entities (vary depending on model. Use a medical model)

In [5]:
x = abstracts[2]

print(x.ents)

(Lung cancer, cancers, lung cancer, lung cancer, Curcumin, polyphenol  , turmeric, Curcuma longa, curcumin, anti-cancer, lung cancer, cell, STAT3, EGFR, FOXO3a, TGF-β, eIF2α, COX-2, Bcl-2, PI3KAkt/mTOR, ROS, Fas/FasL, Cdc42, E-cadherin, MMPs, adiponectin, curcumin)


In [6]:
x = abstracts[10]

print(x.ents)

(tumors, cancer cells, DNA, lethal, cellular DNA, DDRs, cells, DNA, cell, DNA, DDRs, tumor  , DDR, tumor, DDR, DDR, DDR, DNA-PKcs, DNA-dependent protein kinase, ATM/ATR, MRN, MRE11-RAD50-NBS1, PARP, poly[ADP-ribose] polymerase, MDC1, Wee1, LIG4 (, ligase IV, CDK1, BRCA1, BRCA1 C, CHK1, HIF-1 (, hypoxia-inducible factor-1)


### With metadata

In [7]:
def get_special_entities(doc):
    """https://gist.github.com/DeNeutoy/b20860b40b9fa9d33675893c56afde42#file-app-py-L121"""
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    data = [
        [str(getattr(ent, attr)) for attr in attrs]
        for ent in doc.ents
    ]
    return(data, attrs)

In [8]:
x = abstracts[2]

ents, attrs = get_special_entities(x)

ents

[['Lung cancer', 'CANCER', '0', '2', '0', '11'],
 ['cancers', 'CANCER', '7', '8', '37', '44'],
 ['lung cancer', 'CANCER', '25', '27', '158', '169'],
 ['lung cancer', 'CANCER', '66', '68', '399', '410'],
 ['Curcumin', 'SIMPLE_CHEMICAL', '70', '71', '420', '428'],
 ['polyphenol  ', 'ORGANISM_SUBSTANCE', '75', '77', '450', '462'],
 ['turmeric', 'TISSUE', '79', '80', '475', '483'],
 ['Curcuma longa', 'ORGANISM', '81', '83', '485', '498'],
 ['curcumin', 'SIMPLE_CHEMICAL', '101', '102', '597', '605'],
 ['anti-cancer', 'CANCER', '103', '104', '619', '630'],
 ['lung cancer', 'CANCER', '106', '108', '642', '653'],
 ['cell', 'CELL', '115', '116', '706', '710'],
 ['STAT3', 'GENE_OR_GENE_PRODUCT', '156', '157', '961', '966'],
 ['EGFR', 'GENE_OR_GENE_PRODUCT', '158', '159', '968', '972'],
 ['FOXO3a', 'GENE_OR_GENE_PRODUCT', '160', '161', '974', '980'],
 ['TGF-β', 'GENE_OR_GENE_PRODUCT', '162', '163', '982', '987'],
 ['eIF2α', 'SIMPLE_CHEMICAL', '164', '165', '989', '994'],
 ['COX-2', 'GENE_OR_GENE_

In [9]:
# output fits into dataframe
pd.DataFrame(ents, columns=attrs)

Unnamed: 0,text,label_,start,end,start_char,end_char
0,Lung cancer,CANCER,0,2,0,11
1,cancers,CANCER,7,8,37,44
2,lung cancer,CANCER,25,27,158,169
3,lung cancer,CANCER,66,68,399,410
4,Curcumin,SIMPLE_CHEMICAL,70,71,420,428
5,polyphenol,ORGANISM_SUBSTANCE,75,77,450,462
6,turmeric,TISSUE,79,80,475,483
7,Curcuma longa,ORGANISM,81,83,485,498
8,curcumin,SIMPLE_CHEMICAL,101,102,597,605
9,anti-cancer,CANCER,103,104,619,630


### Count them

In [10]:
def get_entity_counts(doc):
    ents, _ = get_special_entities(doc)
    # count the second entries "label_"
    counts = Counter([x[1] for x in ents])
    return(counts)

In [11]:
get_entity_counts(x)

Counter({'CANCER': 6,
         'SIMPLE_CHEMICAL': 5,
         'ORGANISM_SUBSTANCE': 1,
         'TISSUE': 1,
         'ORGANISM': 1,
         'CELL': 1,
         'GENE_OR_GENE_PRODUCT': 12})

# Noun Chunks

In [12]:
x = abstracts[2]

# it's a generator
print(list(x.noun_chunks))

[Lung cancer, new intervention therapies, Curcumin, a natural occurring polyphenol, (Curcuma longa, It, curcumin, anti-cancer effects, Several in vitro and in vivo studies, these mechanisms, limitations, curcumin bioavailability, potential side effects, clinical trials]


In [13]:
x = abstracts[10]

# it's a generator
print(list(x.noun_chunks))

[Radiotherapy, a wide range, the radioresistance, Efforts, sensitizing, targets, radiosensitizers, the outcomes, DNA double-strand breaks, a series, (DDRs, these protective DDRs, tumor  radioresistance, Targeting DDR signaling pathways, an attractive strategy, tumor radioresistance, some important advances, breakthroughs, the DDR signal pathways, we, an update, We, recent advances, current clinical trials, clinical application, key DDR proteins, catalytic subunit, the MRN (MRE11-RAD50-NBS1) complex, the PARP (poly[ADP-ribose] polymerase) family, MDC1, Wee1, LIG4, (ligase, CDK1, BRCA1, (BRCA1 C terminal, CHK1, HIF-1, (hypoxia-inducible factor-1, Challenges]


# Lemmatization

In [14]:
x = abstracts[2]

for token in x[60:80]:
    print(token.text, token.lemma_, token.pos_, token.dep_)

toxicities toxicity NOUN nmod
, , PUNCT punct
have have VERB aux
been be VERB auxpass
considered consider VERB ROOT
in in ADP case
lung lung NOUN compound
cancer cancer NOUN compound
therapy therapy NOUN nmod
. . PUNCT punct
Curcumin curcumin NOUN nsubjpass
, , PUNCT punct
a a DET det
natural natural ADJ amod
occurring occur VERB amod
polyphenol polyphenol NOUN appos
    SPACE nummod
derived derive VERB acl
from from ADP case
turmeric turmeric NOUN nmod


### Count occurences of lemmatized words

Lemmatize? noun chunks??

In [46]:
# find all instances of "is"
# is, was, be, been, are, ...
word = 'is'

# what is its lemma?
selected_lemma = nlp(word)[0].lemma_

selected_lemma

'be'

In [50]:
# now, find it...
for i, abstract in enumerate(abstracts[:5]):
    print(i)
    for token in abstract:
        if token.lemma_ == selected_lemma:
            print(token.text)

0
is
is
are
be
1
are
are
is
is
2
is
been
been
been
are
were
3
are
is
4
is
are
be
been
are
is
is


# Dependency

Nominal subject, direct object, etc etc

All: https://emorynlp.github.io/nlp4j/components/dependency-parsing.html

In [100]:
# nsubj = nominal subject
kind = "nsubj"

for x in abstracts:
    print([token.text for token in x if token.dep_==kind])

['cancer', 'breakthroughs', 'they', 'patient', 'surveys', 'advent', 'we', 'review', 'which', ' ', 'directions']
['system', 'it', 'inhibitors', 'understanding', 'we']
['cancer', 'curcumin', 'studies', 'limitations']
['cancers', 'we', 'deletion', 'restoration', 'PDLIM2', 'cells', 'We', 'findings']
['Manipulation', 'that', 'antibodies', 'signals', 'MoAbs', 'understanding', 'relevance', 'challenge', 'review']
['research', 'evidence', 'mechanisms', 'Microorganisms', 'purpose', 'microbiome']
['evolution', 'it', 'therapies', 'studies', 'PD-L1', 'who', 'studies', 'review', 'we', 'that']
['cancer', 'prognosis', 'breakthrough', 'experiments', 'knowledge', 'data', 'opportunities', 'therapies', 'exploration', 'heterogeneities', 'mechanisms', 'that', 'review', ' ']
['cancer', 'diagnosis', 'characteristics', 'paper', 'efficacy', 'candidates', 'Cancers', 'analysis', 'which', 'paper', 'which']
['relationship', 'we', 'We', 'loss', 'Delivery', 'findings']
['Radiotherapy', 'radioresistance', 'Efforts', '

In [101]:
# dobj = direct object
kind = "dobj"

for x in abstracts:
    print([token.text for token in x if token.dep_==kind])

['burden', 'subset', 'shift', 'therapies', 'potential', 'landscape', 'targets', 'light', 'opportunities', 'landscape', 'state', 'outcomes', 'biology', 'evidence', 'investigations', 'insights', 'fruit']
['role', 'cells', 'progression', ' ', 'paradigm', 'activity', 'utility', 'cancer', 'knowledge', 'overview']
['effects', 'bioavailability']
['PDLIM2', 'development', 'resistance', 'activity', 'NF-κB/RelA', 'expression', 'genes', 'induction', 'rationale', 'therapies']
['growth', 'activation', 'advantages', 'mechanisms', 'interest', 'response', 'understanding', 'responses']
['interplay', 'tumor', 'production', 'role', 'strategies', 'microbiota']
['outcomes', 'outcomes', 'irrespective', 'inhibitors', 'correlations', 'patients', 'markers', 'characteristics', 'alteration', 'activity']
['approach', 'survival', 'limitations', 'heterogeneity', 'inroads', 'burden', 'those', 'repair', 'need', 'discoveries', 'chemo-sensitive', 'samples', 'techniques', 'aspiration', 'disease']
['survival', 'qualifica

# Helper: Create data frame

In [51]:
def make_feature_df(doc, expand_features=False):
    feature_names = ['data', 'tokens', 'entities_collapsed']
    # note: len(x) counts tokens because each x is a `spacy.tokens.token.Token`
    features = [(str(x), len(x), get_entity_counts(x)) for x in doc]
    df = pd.DataFrame(features, columns=feature_names)
    if expand_features:
        # expand the 'entities' into columns
        df = df.join(pd.DataFrame(df.pop('entities_collapsed').values.tolist()))
    return(df)

In [52]:
df = make_feature_df(abstracts, expand_features=False)

df.head()

Unnamed: 0,data,tokens,entities_collapsed
0,Lung cancer is the number one cause of cancer-...,300,"{'CANCER': 15, 'GENE_OR_GENE_PRODUCT': 5, 'ORG..."
1,The immune system plays a dual role in tumor e...,174,"{'CANCER': 8, 'CELL': 3, 'GENE_OR_GENE_PRODUCT..."
2,Lung cancer is among the most common cancers w...,210,"{'CANCER': 6, 'SIMPLE_CHEMICAL': 5, 'ORGANISM_..."
3,Most cancers are resistant to anti-PD-1/PD-L1 ...,169,"{'CANCER': 7, 'GENE_OR_GENE_PRODUCT': 8, 'ORGA..."
4,Manipulation of the immune response is a game ...,258,"{'CANCER': 8, 'GENE_OR_GENE_PRODUCT': 6, 'CELL..."


In [53]:
df = make_feature_df(abstracts, expand_features=True)

df.head()

Unnamed: 0,data,tokens,CANCER,GENE_OR_GENE_PRODUCT,ORGANISM,CELL,ORGAN,TISSUE,SIMPLE_CHEMICAL,ORGANISM_SUBSTANCE,MULTI_TISSUE_STRUCTURE,CELLULAR_COMPONENT,IMMATERIAL_ANATOMICAL_ENTITY,PATHOLOGICAL_FORMATION,ORGANISM_SUBDIVISION
0,Lung cancer is the number one cause of cancer-...,300,15,5.0,1.0,1.0,5.0,1.0,,,,,,,
1,The immune system plays a dual role in tumor e...,174,8,1.0,2.0,3.0,,,,,,,,,
2,Lung cancer is among the most common cancers w...,210,6,12.0,1.0,1.0,,1.0,5.0,1.0,,,,,
3,Most cancers are resistant to anti-PD-1/PD-L1 ...,169,7,8.0,2.0,3.0,,,3.0,,,,,,
4,Manipulation of the immune response is a game ...,258,8,6.0,1.0,6.0,,,3.0,,,,,,
