In [1]:
!pip install scispacy



In [0]:
import scispacy
import spacy

In [0]:
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_core_sci_sm-0.2.0.tar.gz

In [0]:

!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_core_sci_md-0.2.0.tar.gz 

In [0]:
# Doesnt work
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_core_sci_lg-0.2.3.tar.gz 

In [0]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_ner_bc5cdr_md-0.2.0.tar.gz 

In [0]:

!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_ner_bionlp13cg_md-0.2.0.tar.gz 

In [0]:
import scispacy
import spacy
from spacy import displacy
from collections import Counter
#import en_core_web_sm
import en_ner_bc5cdr_md
#import en_core_sci_sm
import en_core_sci_md
import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker
from collections import OrderedDict
from pprint import pprint


In [0]:
text = """
Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity. 
They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC).
"""

In [0]:
def display_entities(model,document):
    """ 
    This function displays word entities

    Parameters: 
         model(module): A pretrained model from spaCy(https://spacy.io/models) or ScispaCy(https://allenai.github.io/scispacy/)
         document(str): Document to be processed

    Returns: Image rendering and list of named/unnamed word entities and entity labels 
     """
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = pprint(set([(X.text, X.label_) for X in doc.ents]))
    return  displacy_image, entity_and_label

In [0]:
def show_medical_abbreviation(model,document):
    """ 
    This function detects and resolves medical abbreviations in word entities

    Parameters: 
         model(module): A pretrained biomedical model from ScispaCy(https://allenai.github.io/scispacy/)
         document(str): Document to be processed

    Returns: List of unique abbreviations and their resolution 
     """
    nlp = model.load()
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    doc = nlp(document)
    abbreviated = list(set([f"{abrv}  {abrv._.long_form}" for abrv in doc._.abbreviations]))             #list is set to ensure only unique values are returned
    return abbreviated

In [0]:
def unified_medical_language_entity_linker(model,document):
    """ 
    This function links named entities to the Unified Medical Language System UMLS (https://www.nlm.nih.gov/research/umls/)

    Parameters: 
         model(module): A pretrained biomedical model from ScispaCy(https://allenai.github.io/scispacy/)
         document(str): Document to be processed

    Returns: Attributes of Named entities accessible in the Unified Medical Language System database
     """
    nlp = model.load()
    linker = UmlsEntityLinker(k=10,max_entities_per_mention = 2)    #parameters are tunable
    nlp.add_pipe(linker)
    doc = nlp(document)
    entity = doc.ents
    entity = [str(item) for item in entity]               # convert each entity tuple to list of strings
    entity = str(OrderedDict.fromkeys(entity))            # returns unique entities only
    entity = nlp(entity).ents                             # convert unique entities back to '.ents' object
    for entity in entity:
        for umls_ent in entity._.umls_ents:
            print("Entity Name:" ,entity)
            Concept_Id, Score = umls_ent
            print('Concept_Id = {} Score = {}'.format(Concept_Id,Score))
            print(linker.umls.cui_to_entity[umls_ent[0]])

In [22]:
display_entities(en_core_sci_md,text)

{('HCC', 'ENTITY'),
 ('MDSC', 'ENTITY'),
 ('cancer', 'ENTITY'),
 ('hepatocellular \ncarcinoma', 'ENTITY'),
 ('humans', 'ENTITY'),
 ('immature', 'ENTITY'),
 ('immunosuppressive activity', 'ENTITY'),
 ('suppressor cells', 'ENTITY'),
 ('tumor-bearing mice', 'ENTITY')}


(None, None)

In [23]:
display_entities(en_ner_bionlp13cg_md,text)

{('HCC', 'CANCER'),
 ('MDSC', 'CANCER'),
 ('cancer', 'CANCER'),
 ('cells', 'CELL'),
 ('hepatocellular \ncarcinoma', 'CANCER'),
 ('humans \n', 'ORGANISM'),
 ('mice', 'ORGANISM')}


(None, None)

In [21]:
display_entities(en_ner_bc5cdr_md,text)

{('HCC', 'DISEASE'),
 ('cancer', 'DISEASE'),
 ('hepatocellular \ncarcinoma', 'DISEASE')}


(None, None)

In [25]:
show_medical_abbreviation(en_ner_bc5cdr_md,text)

['HCC  hepatocellular \ncarcinoma', 'MDSC  Myeloid derived suppressor cells']

In [26]:
show_medical_abbreviation(en_ner_bionlp13cg_md,text)

['HCC  hepatocellular \ncarcinoma', 'MDSC  Myeloid derived suppressor cells']

In [28]:
unified_medical_language_entity_linker(en_ner_bc5cdr_md,text)

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpf9ewn_sj
Finished download, copying /tmp/tmpf9ewn_sj to cache at /root/.scispacy/datasets/ea855fd121a193f03190a91417c209d4cd97e63d3ce4b456c248ef7c13a4ca77.03518aabd12de2103a27a50302f37c3d87b0f313a8be08b5ec306c9c4334b9b1.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin not found in cache, downloading to /tmp/tmpuuqie59h
Finished download, copying /tmp/tmpuuqie59h to cache at /root/.scispacy/datasets/5f620d1bd549a98c005ed601a73806ea2cd1a86ae6c54bbc62bcb3b452ca2630.27a7ac6807fde6628311ff7d70b86fefc640d0eb70637b544c591722a2c16c2a.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmpa0refxuz
Finished download, copying /tmp/tmpa0refxuz to cache at /root/.scispacy/datasets/ffb7a77cdcb3c9233c1e400



https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/concept_aliases.json not found in cache, downloading to /tmp/tmprr9076ye
Finished download, copying /tmp/tmprr9076ye to cache at /root/.scispacy/datasets/0f064d20aefab965d5772b2100f8436b3541e7d5313c76cfe5fe070902f149fe.31df9cdb04729860a81bd6c980224ed2bff582586c398d0c9b96ae4e257b9da2.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json not found in cache, downloading to /tmp/tmph6fnmrn8
Finished download, copying /tmp/tmph6fnmrn8 to cache at /root/.scispacy/datasets/13b30cd31cd37c1b52f3df6ea023061172d16e9941660e677fdbb29489af7410.4ad71d86ce780e00cab131c7e3b81acfd2f11dd80ccd61125c8bcde506f2ab8a.umls_2017_aa_cat0129.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmpyarukvln
Finished download, copying /tmp/tmpyarukvln to cache at /root/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f

In [29]:
unified_medical_language_entity_linker(en_ner_bionlp13cg_md,text)



Entity Name: mice
Concept_Id = C0025914 Score = 1.0
CUI: C0025914, Name: House mice
Definition: The common house mouse, often used as an experimental organism.
TUI(s): T015
Aliases (abbreviated, total: 32): 
	 Mice, House, House Mice, house mice, House Mouse, House Mouse, Mouse, House, house mouse, house mouse, House mouse, Mouse
Entity Name: mice
Concept_Id = C0025929 Score = 1.0
CUI: C0025929, Name: Laboratory mice
Definition: mouse that has been bred or acquired for the purpose of being a research subject in a facility where scientific research and experiments are conducted.
TUI(s): T015
Aliases (abbreviated, total: 13): 
	 Mice, Laboratory, Laboratory Mice, laboratory mice, Laboratory Mouse, Mouse, Laboratory, laboratory mouse, laboratory mouse, Laboratory mouse, Mouse, laboratory, mice
Entity Name: humans
Concept_Id = C0086418 Score = 1.0
CUI: C0086418, Name: Homo sapiens
Definition: Members of the species Homo sapiens.
TUI(s): T016
Aliases (abbreviated, total: 36): 
	 Homo sapien