# Med ML NLP Playgrounds

Classify abstracts based on features engineered with `(sci)spacy`

Data: 18 abstracts

# Load

In [15]:
import spacy
import pandas as pd

from collections import Counter

In [43]:
def load_data(p):
    with open(p) as f:
        x = f.read()
    x = x.split("\n\n")
    x = [i.replace("\n", " ") for i in x]
    return(x)

p = "../../data/abstracts_manual.txt"
abstracts_raw = load_data(p)

len(abstracts_raw)

18

# Model

In [44]:
model = 'en_ner_bionlp13cg_md'
nlp = spacy.load(model)

In [45]:
abstracts = [nlp(x) for x in abstracts_raw]

# Create Features

## Count medical entities

In [63]:
def get_special_entities(doc):
    """https://gist.github.com/DeNeutoy/b20860b40b9fa9d33675893c56afde42#file-app-py-L121"""
    attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
    data = [
        [str(getattr(ent, attr)) for attr in attrs]
        for ent in doc.ents
    ]
    return(data, attrs)


def get_entity_counts(doc):
    ents, _ = get_special_entities(doc)
    # count the second entries "label_"
    counts = Counter([x[1] for x in ents])
    return(counts)

def make_feature_df(doc, expand_features=False):
    counts = [(str(x), len(x), get_entity_counts(x)) for x in doc]
    df = pd.DataFrame(counts, columns=['data', 'length', 'entities'])
    if expand_features:
        # expand the 'entities' into columns
        df = df.join(pd.DataFrame(df.pop('entities').values.tolist()))
    return(df)

In [65]:
df = make_feature_df(abstracts, expand_features=True)

df.head()

Unnamed: 0,data,length,CANCER,GENE_OR_GENE_PRODUCT,ORGANISM,CELL,ORGAN,TISSUE,SIMPLE_CHEMICAL,ORGANISM_SUBSTANCE,MULTI_TISSUE_STRUCTURE,CELLULAR_COMPONENT,IMMATERIAL_ANATOMICAL_ENTITY,PATHOLOGICAL_FORMATION,ORGANISM_SUBDIVISION
0,Lung cancer is the number one cause of cancer-...,300,15,5.0,1.0,1.0,5.0,1.0,,,,,,,
1,The immune system plays a dual role in tumor e...,174,8,1.0,2.0,3.0,,,,,,,,,
2,Lung cancer is among the most common cancers w...,210,6,12.0,1.0,1.0,,1.0,5.0,1.0,,,,,
3,Most cancers are resistant to anti-PD-1/PD-L1 ...,169,7,8.0,2.0,3.0,,,3.0,,,,,,
4,Manipulation of the immune response is a game ...,258,8,6.0,1.0,6.0,,,3.0,,,,,,


## Todo: for length, use tokens

In [72]:
len([token for token in abstracts[1]])

174

## Todo: add count of Verbs, nounds, etc
Also, **normalize** all the counts??