# Biomedical Named Entity Recognition

In [1]:
import spacy
import scispacy

In [2]:
from spacy import displacy

In [3]:
text = open('diabetes.txt').read()

In [4]:
text[:100]

'The prevalence of diabetes is rapidly rising all over the globe at an alarming rate. Over the last t'

In [5]:
text[-100:]

'resistance and secretory defect during pregnancy results in GDM [39-45].\nDiabetic ketoacidosis (DKA)'

In [6]:
len(text)

11409

In [7]:
spacynlp = spacy.load('en_core_web_sm')
scinlp = spacy.load('en_core_sci_sm')


   **Python function display_entities()accepts a model and document to return a displacy image and word entities. The function will be used on three different scispaCy models and the tests document. The function can be adjusted as needed. E.g To view dependency parsing instead of entities use displacy.render(doc,jupyter=True,style='dep')**

In [13]:
def display_entities(model, text):
    scinlp = spacy.load(model)
    doc = scinlp(text)
    displacy_image = displacy.render(doc, style = 'ent')
    entity_label = set([(token.text, token.label_) for token in doc.ents])
    return  entity_label

In [14]:
display_entities('en_core_web_sm', text)

{('1', 'CARDINAL'),
 ('2', 'CARDINAL'),
 ('2025', 'DATE'),
 ('27', 'CARDINAL'),
 ('2nd', 'ORDINAL'),
 ('39', 'CARDINAL'),
 ('4/4', 'CARDINAL'),
 ('6', 'CARDINAL'),
 ('67%', 'PERCENT'),
 ('69.9 million', 'CARDINAL'),
 ('Addison', 'PERSON'),
 ('Caucasians', 'NORP'),
 ('DM', 'ORG'),
 ('Forty', 'CARDINAL'),
 ('GAD', 'ORG'),
 ('GDM', 'ORG'),
 ('Hashimoto', 'GPE'),
 ('Hyperglycemia', 'PRODUCT'),
 ('India', 'GPE'),
 ('Insulin', 'ORG'),
 ('Insulin', 'PERSON'),
 ('Latino', 'PERSON'),
 ('Seshiah et', 'PERSON'),
 ('T1DM', 'GPE'),
 ('TRIGR', 'ORG'),
 ('VAT', 'ORG'),
 ('VAT', 'PRODUCT'),
 ('approximately 5%', 'PERCENT'),
 ('approximately 7%', 'PERCENT'),
 ('around 40.9 million', 'CARDINAL'),
 ('daily', 'DATE'),
 ('first', 'ORDINAL'),
 ('hyperglucagonaemia', 'GPE'),
 ('insulin autoantibody', 'PERSON'),
 ('meta-analysis[26', 'PERSON'),
 ('more than 2,00,000', 'CARDINAL'),
 ('obese', 'NORP'),
 ('one', 'CARDINAL'),
 ('pregnancy[22', 'PRODUCT'),
 ('second', 'ORDINAL'),
 ('the International Diabetes Fede

In [15]:
display_entities('en_core_sci_sm', text)

{('Abnormalities', 'ENTITY'),
 ('Addison’s disease', 'ENTITY'),
 ('Autoimmunity', 'ENTITY'),
 ('Caucasians', 'ENTITY'),
 ('Chronic fuel', 'ENTITY'),
 ('Cytokines', 'ENTITY'),
 ('DIABETES', 'ENTITY'),
 ('DKA', 'ENTITY'),
 ('DM', 'ENTITY'),
 ('DQ', 'ENTITY'),
 ('DR', 'ENTITY'),
 ('DR3/4', 'ENTITY'),
 ('Diabetes', 'ENTITY'),
 ('Diabetes mellitus', 'ENTITY'),
 ('Diabetic acidosis', 'ENTITY'),
 ('Diabetic ketoacidosis', 'ENTITY'),
 ('Eisenbarth', 'ENTITY'),
 ('Exposure to', 'ENTITY'),
 ('GAD', 'ENTITY'),
 ('GDM', 'ENTITY'),
 ('GLP-1', 'ENTITY'),
 ('Gestational diabetes', 'ENTITY'),
 ('Gestational diabetes mellitus', 'ENTITY'),
 ('HLA', 'ENTITY'),
 ('Hashimoto’s thyroiditis', 'ENTITY'),
 ('Hyperglycemia', 'ENTITY'),
 ('IAA', 'ENTITY'),
 ('ICA512/IA-2', 'ENTITY'),
 ('India', 'ENTITY'),
 ('Insulin', 'ENTITY'),
 ('Insulin resistance', 'ENTITY'),
 ('Insulitis', 'ENTITY'),
 ('International Diabetes', 'ENTITY'),
 ('Ketones', 'ENTITY'),
 ('Latino women', 'ENTITY'),
 ('Mechanism\nDiabetes mellitus',

In [16]:
display_entities('en_ner_bc5cdr_md', text)

{('Addison’s disease', 'DISEASE'),
 ('Autoimmunity', 'DISEASE'),
 ('DKA', 'DISEASE'),
 ('DM', 'DISEASE'),
 ('DR', 'DISEASE'),
 ('Diabetes', 'DISEASE'),
 ('Diabetes mellitus', 'DISEASE'),
 ('Diabetic acidosis', 'DISEASE'),
 ('Diabetic ketoacidosis', 'DISEASE'),
 ('Eisenbarth', 'CHEMICAL'),
 ('GDM', 'DISEASE'),
 ('Gestational diabetes', 'DISEASE'),
 ('Gestational diabetes mellitus', 'DISEASE'),
 ('Hashimoto’s thyroiditis', 'DISEASE'),
 ('Hyperglycemia', 'DISEASE'),
 ('IAA', 'CHEMICAL'),
 ('Insulitis', 'DISEASE'),
 ('Ketones', 'CHEMICAL'),
 ('Overweight', 'DISEASE'),
 ('Seshiah', 'CHEMICAL'),
 ('T1DM', 'DISEASE'),
 ('TRIGR', 'DISEASE'),
 ('TrialNet', 'CHEMICAL'),
 ('Type 1 diabetes\nT1DM', 'DISEASE'),
 ('Type 2 diabetes', 'DISEASE'),
 ('amyloid formation[18–20', 'CHEMICAL'),
 ('autoimmune disease', 'DISEASE'),
 ('autoimmune diseases', 'DISEASE'),
 ('autoimmunity', 'DISEASE'),
 ('blood glucose', 'DISEASE'),
 ('blood sugar', 'DISEASE'),
 ('carbohydrate intolerance', 'DISEASE'),
 ('celiac di

In [17]:
display_entities('en_ner_bionlp13cg_md', text)

{('18.9%[24', 'SIMPLE_CHEMICAL'),
 ('Cytokines', 'CELLULAR_COMPONENT'),
 ('DKA', 'SIMPLE_CHEMICAL'),
 ('DQ', 'GENE_OR_GENE_PRODUCT'),
 ('DR 4/4', 'GENE_OR_GENE_PRODUCT'),
 ('GAD', 'GENE_OR_GENE_PRODUCT'),
 ('GDM', 'CANCER'),
 ('GDM', 'PATHOLOGICAL_FORMATION'),
 ('GLP-1', 'SIMPLE_CHEMICAL'),
 ('IAA', 'SIMPLE_CHEMICAL'),
 ('Insulin', 'GENE_OR_GENE_PRODUCT'),
 ('Ketones', 'SIMPLE_CHEMICAL'),
 ('Latino women', 'ORGANISM'),
 ('Patients', 'ORGANISM'),
 ('People', 'ORGANISM'),
 ('SAT', 'TISSUE'),
 ('T-cell', 'CELL'),
 ('VAT', 'TISSUE'),
 ('adiponectin', 'GENE_OR_GENE_PRODUCT'),
 ('adipose tissue', 'TISSUE'),
 ('amyloid formation[18–20', 'GENE_OR_GENE_PRODUCT'),
 ('beta cell', 'CELL'),
 ('blood', 'ORGANISM_SUBSTANCE'),
 ('blood glucose', 'ORGANISM_SUBSTANCE'),
 ('blood sugar', 'MULTI-TISSUE_STRUCTURE'),
 ('blood sugar', 'ORGANISM_SUBSTANCE'),
 ('bloodstream', 'ORGANISM'),
 ('body', 'ORGAN'),
 ('body', 'ORGANISM_SUBDIVISION'),
 ('body', 'TISSUE'),
 ('brain', 'ORGAN'),
 ('carbohydrate', 'SIMPLE_

   **The function show_medical_abbreviation() accepts a model and document to return abbreviated words and their resolutions. The function can be adjusted as needed. I set the list so only unique values are returned**

In [18]:
from scispacy.abbreviation import AbbreviationDetector

In [19]:
def show_med_abbreviation(model, text):
    scinlp = spacy.load(model)
    abbreviation_pipe = AbbreviationDetector(scinlp)
    scinlp.add_pipe(abbreviation_pipe)
    doc = scinlp(text)
    abbreviated = list(set([f'{abrv} {abrv._.long_form}' for abrv in doc._.abbreviations]))
    return  abbreviated

In [20]:
show_med_abbreviation('en_ner_bc5cdr_md', text)

['GLP-1 glucagon-like peptide 1',
 'GAD glutamic acid decarboxylase',
 'SAT subcutaneous adipose tissue',
 'DKA Diabetic ketoacidosis',
 'DM Diabetes mellitus',
 'IAA insulin autoantibody',
 'VAT visceral adipose tissue',
 'GDM Gestational diabetes mellitus']

References

scispaCy is a Python package containing spaCy models for processing biomedical, scientific or clinical text. https://allenai.github.io/scispacy/


https://medium.com/@oyewusiwuraola/how-to-use-scispacy-for-biomedical-named-entity-recognition-abbreviation-resolution-and-link-umls-87d3f7c08db2