# Med ML NLP Playgrounds

Classify abstracts based on features engineered with `(sci)spacy`

Data: 18 abstracts

# Load

In [5]:
import spacy
import pandas as pd

from collections import Counter

In [6]:
def load_data(p):
    with open(p) as f:
        x = f.read()
    x = x.split("\n\n")
    x = [i.replace("\n", " ") for i in x]
    return(x)

p = "../../data/abstracts_manual.txt"
abstracts_raw = load_data(p)

len(abstracts_raw)

18

# Model

In [7]:
model = 'en_ner_bionlp13cg_md'
nlp = spacy.load(model)

In [8]:
abstracts = [nlp(x) for x in abstracts_raw]

# Create Features

## Count medical entities

In [21]:
def get_special_entities(doc):
    """https://gist.github.com/DeNeutoy/b20860b40b9fa9d33675893c56afde42#file-app-py-L121"""
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    data = [
        [str(getattr(ent, attr)) for attr in attrs]
        for ent in doc.ents
    ]
    return(data, attrs)


def get_entity_counts(doc):
    ents, _ = get_special_entities(doc)
    # count the second entries "label_"
    counts = Counter([x[1] for x in ents])
    return(counts)


def make_feature_df(doc, expand_features=False):
    feature_names = ['data', 'tokens', 'entities_collapsed']
    # note: len(x) counts tokens because each x is a `spacy.tokens.token.Token`
    features = [(str(x), len(x), get_entity_counts(x)) for x in doc]
    df = pd.DataFrame(features, columns=feature_names)
    if expand_features:
        # expand the 'entities' into columns
        df = df.join(pd.DataFrame(df.pop('entities_collapsed').values.tolist()))
    return(df)

In [24]:
df = make_feature_df(abstracts, expand_features=True)

df.head()

Unnamed: 0,data,tokens,CANCER,GENE_OR_GENE_PRODUCT,ORGANISM,CELL,ORGAN,TISSUE,SIMPLE_CHEMICAL,ORGANISM_SUBSTANCE,MULTI_TISSUE_STRUCTURE,CELLULAR_COMPONENT,IMMATERIAL_ANATOMICAL_ENTITY,PATHOLOGICAL_FORMATION,ORGANISM_SUBDIVISION
0,Lung cancer is the number one cause of cancer-...,300,15,5.0,1.0,1.0,5.0,1.0,,,,,,,
1,The immune system plays a dual role in tumor e...,174,8,1.0,2.0,3.0,,,,,,,,,
2,Lung cancer is among the most common cancers w...,210,6,12.0,1.0,1.0,,1.0,5.0,1.0,,,,,
3,Most cancers are resistant to anti-PD-1/PD-L1 ...,169,7,8.0,2.0,3.0,,,3.0,,,,,,
4,Manipulation of the immune response is a game ...,258,8,6.0,1.0,6.0,,,3.0,,,,,,


## Todo: add count of Verbs, nouns, etc
Also, **normalize** all the counts??

See https://spacy.io/usage/spacy-101

In [65]:
x = abstracts[2]

list(x.noun_chunks)

[Lung cancer,
 new intervention therapies,
 Curcumin,
 a natural occurring polyphenol,
 (Curcuma longa,
 It,
 curcumin,
 anti-cancer effects,
 Several in vitro and in vivo studies,
 these mechanisms,
 limitations,
 curcumin bioavailability,
 potential side effects,
 clinical trials]

In [66]:
x.ents

(Lung cancer,
 cancers,
 lung cancer,
 lung cancer,
 Curcumin,
 polyphenol  ,
 turmeric,
 Curcuma longa,
 curcumin,
 anti-cancer,
 lung cancer,
 cell,
 STAT3,
 EGFR,
 FOXO3a,
 TGF-β,
 eIF2α,
 COX-2,
 Bcl-2,
 PI3KAkt/mTOR,
 ROS,
 Fas/FasL,
 Cdc42,
 E-cadherin,
 MMPs,
 adiponectin,
 curcumin)

In [60]:
x.noun_chunks

generator

In [37]:
[token.text for token in abstracts[2] if token.dep_=="dobj"]

['effects', 'bioavailability']

In [52]:
for token in abstracts[5][:10]:
    print(token.text, token.lemma_, token.pos_, token.dep_)

Recent recent ADJ amod
research research NOUN nsubj
on on ADP case
cancer-associated cancer-associated ADJ amod
microbial microbial ADJ amod
communities community NOUN nmod
has have VERB aux
elucidated elucidate VERB ROOT
the the DET det
interplay interplay NOUN dobj


In [68]:
for token in x:
    if token.lemma_ == 'cancer':
        print('a')

a
a
a
a
a


## Dependency of given word

Find occurences of a word such as "cancer", lemmatize it, and find and counts its dep_

In [48]:
for token in abstracts[5]:
    if str(token.text) == 'cancer':
        print("a")

a


In [49]:
abstracts[5]

Recent research on cancer-associated microbial communities has elucidated the interplay between bacteria, immune cells, and tumor cells; the bacterial pathways involved in the induction of carcinogenesis; and their clinical significance. Although accumulating evidence shows that a dysbiotic condition is associated with lung carcinogenesis, the underlying mechanisms remain unclear. Microorganisms possibly trigger tumor initiation and progression, presumably via  the production of bacterial toxins and other pro-inflammatory factors. The purpose of this review is to discuss the basic role of the airway microbiome in carcinogenesis and the underlying molecular mechanisms, with the aim of developing anticancer strategies involving the airway microbiota. In addition, the mechanisms via which the microbiome acts as a modulator of immunotherapies in lung cancer are summarized.