In [1]:
import scispacy
import spacy

# Load the `en_core_sci_sm` pipeline; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_sci_sm")

# Sample passage. Spelled out as adjacent literals so the leading newline and
# the trailing space before each line break are explicit instead of hidden at
# the end of source lines.
text = (
    "\n"
    "Myeloid derived suppressor cells (MDSC) are immature \n"
    "myeloid cells with immunosuppressive activity. \n"
    "They accumulate in tumor-bearing mice and humans \n"
    "with different types of cancer, including hepatocellular \n"
    "carcinoma (HCC).\n"
)
doc = nlp(text)

# Collect the sentence spans produced by the pipeline and show them.
sentences = list(doc.sents)
print(sentences)

[
Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity. 
, They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC).
]


In [2]:
# Examine the entities extracted by the mention detector.
# Note that they don't have types like in spaCy, and they
# are more general (e.g. including verbs) - these are any
# spans which might be an entity in UMLS, a large
# biomedical database.
# `doc.ents` is printed as a tuple of spans (see the output below).
print(doc.ents)

(suppressor cells, MDSC, immature, immunosuppressive activity, accumulate, tumor-bearing mice, humans, cancer, hepatocellular 
carcinoma, HCC)


In [5]:
from spacy import displacy
from sagas.nlu.spacy_helper import spacy_mgr, vis
# Take the first sentence of `doc` and draw its dependency parse in the notebook.
first_sentence = next(doc.sents)
displacy.render(first_sentence, style='dep', jupyter=True)
# Disabled alternative using the imported sagas helper: vis(doc, text)

In [6]:
from scispacy.abbreviation import AbbreviationDetector

# Add the abbreviation pipe to the spacy pipeline.
# `nlp.add_pipe` raises ValueError when a component with the same name is
# already registered, so re-running this cell against the same `nlp` object
# used to fail. Reuse an AbbreviationDetector that is already in the pipeline
# and only create and register one when none is present.
abbreviation_pipe = next(
    (
        component
        for _name, component in nlp.pipeline
        if isinstance(component, AbbreviationDetector)
    ),
    None,
)
if abbreviation_pipe is None:
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

# NOTE(review): the backslash continuations keep each following line's leading
# spaces inside the string, so the pipeline sees those runs of spaces. The
# literal is left unchanged so the offsets printed below stay comparable with
# the recorded output.
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

# One row per detected abbreviation: the short form, its (start, end) span
# offsets within `doc`, and the long form stored on the span's `_.long_form`.
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [1]:
import spacy
import scispacy

from scispacy.umls_linking import UmlsEntityLinker

# NOTE(review): this rebinds `nlp` to a newly loaded pipeline, so any pipes that
# were added to a previously loaded `nlp` object would have to be added again.
nlp = spacy.load("en_core_sci_sm")
# The notes below describe constructing the UmlsEntityLinker (the commented-out
# line at the end of this cell), not the `spacy.load` call above.
# Constructing the linker takes a while, because we have to download ~1GB of data
# and load a large JSON file (the knowledge base). Be patient!
# Thankfully it should be faster after the first time you use it, because
# the downloads are cached.
# NOTE: The resolve_abbreviations parameter is optional, and requires that
# the AbbreviationDetector pipe has already been added to the pipeline. Adding
# the AbbreviationDetector pipe and setting resolve_abbreviations to True means
# that linking will only be performed on the long form of abbreviations.
# linker = UmlsEntityLinker(resolve_abbreviations=True)

In [19]:
# Download locations of the linking-model files. Every file lives under the
# same remote directory, so the mapping is built from that prefix plus the
# per-key file name.
_LINKING_MODEL_ROOT = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model"
_LINKING_MODEL_FILES = {
    "ann_index": "nmslib_index.bin",
    "tfidf_vectorizer": "tfidf_vectorizer.joblib",
    "tfidf_umls_vectors": "tfidf_vectors_sparse.npz",
    "concept_aliases_list": "concept_aliases.json",
}
DEFAULT_PATHS = {
    key: f"{_LINKING_MODEL_ROOT}/{file_name}"
    for key, file_name in _LINKING_MODEL_FILES.items()
}

# File to inspect in the following cells; swap the key (for example
# 'tfidf_umls_vectors' or 'ann_index') to look at a different one.
url = DEFAULT_PATHS["concept_aliases_list"]

In [23]:
# Remote locations of the UMLS JSON file and the semantic type tree TSV.
DEFAULT_UMLS_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json"
DEFAULT_UMLS_TYPES_PATH = (
    "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv"
)

# Point `url` at the UMLS JSON file, replacing whatever it was bound to before.
url = DEFAULT_UMLS_PATH

In [24]:
from scispacy.file_cache import url_to_filename
from pathlib import Path
import os
import requests

# Cache location: $SCISPACY_CACHE when set, otherwise ~/.scispacy; downloaded
# files go in its "datasets" subdirectory.
CACHE_ROOT = Path(os.getenv("SCISPACY_CACHE", str(Path.home() / ".scispacy")))
DATASET_CACHE = str(CACHE_ROOT / "datasets")
cache_dir = DATASET_CACHE

# Seconds to wait for the server before giving up. `requests` applies no
# timeout by default, so without this an unresponsive host would block the
# kernel indefinitely. A timeout surfaces as requests.exceptions.Timeout,
# which is an IOError subclass like the error raised below.
HEAD_TIMEOUT_SECONDS = 30

# HEAD request: only the response headers are needed here, not the file body.
response = requests.head(url, allow_redirects=True, timeout=HEAD_TIMEOUT_SECONDS)
if response.status_code != 200:
    raise IOError(
        "HEAD request failed for url {} with status code {}".format(
            url, response.status_code
        )
    )

# ETag header of the response, or None when the server did not send one.
etag = response.headers.get("ETag")
# Bare expression so the notebook displays the value.
etag

'"8c805740f09578d874ee1c02f03cfd45-61"'

In [25]:
# Derive the cache file name from the URL and the ETag fetched for it.
filename = url_to_filename(url, etag)

# Full path of that file inside the cache directory.
cache_path = os.path.join(cache_dir, filename)
# Bare expression so the notebook displays the resolved path.
cache_path

'/Users/xiaofeiwu/.scispacy/datasets/13b30cd31cd37c1b52f3df6ea023061172d16e9941660e677fdbb29489af7410.4ad71d86ce780e00cab131c7e3b81acfd2f11dd80ccd61125c8bcde506f2ab8a.umls_2017_aa_cat0129.json'

In [26]:
# Construct the UMLS entity linker. Files that are not cached yet are
# downloaded and copied into the local scispacy cache (see the log output below).
# NOTE(review): resolve_abbreviations=True presumably only has an effect when
# the AbbreviationDetector pipe is part of `nlp` - confirm it has been added.
linker = UmlsEntityLinker(resolve_abbreviations=True)

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /var/folders/fv/7k1qk5v11dn33sdcngv2wbnm0000gn/T/tmpi28r9t99
Finished download, copying /var/folders/fv/7k1qk5v11dn33sdcngv2wbnm0000gn/T/tmpi28r9t99 to cache at /Users/xiaofeiwu/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f8966c4178b892190a302b21836f.330707f4efe774134872b9f77f0e3208c1d30f50800b3b39a6b8ec21d9adf1b7.umls_semantic_type_tree.tsv


In [27]:
# Register the linker only when the pipeline does not already contain one.
# `nlp.add_pipe` raises ValueError if a component with the same name is
# already present, which made this cell fail when re-run on the same `nlp`.
if not any(isinstance(component, type(linker)) for _name, component in nlp.pipeline):
    nlp.add_pipe(linker)

# NOTE(review): the backslash continuations keep each following line's leading
# spaces inside the string; the literal is left unchanged so results stay
# comparable with the recorded output.
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

# Use the second detected entity (index 1) as the example to inspect.
entity = doc.ents[1]

print("Name: ", entity)

Name:  bulbar muscular atrophy


In [29]:
# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
# The first item of every candidate is the concept id, used here as the key
# into the linker's knowledge base to print the full entry.
for candidate in entity._.umls_ents:
    cui = candidate[0]
    print(' + ', linker.umls.cui_to_entity[cui])

 +  CUI: C1839259, Name: Bulbo-Spinal Atrophy, X-Linked
Definition: An X-linked recessive form of spinal muscular atrophy. It is due to a mutation of the gene encoding the ANDROGEN RECEPTOR.
TUI(s): T047
Aliases (abbreviated, total: 50): 
	 Bulbo-Spinal Atrophy, X-Linked, Bulbo-Spinal Atrophy, X-Linked, Atrophies, X-Linked Bulbo-Spinal, Bulbo Spinal Atrophy, X Linked, Bulbo-Spinal Atrophies, X-Linked, X-Linked Bulbo-Spinal Atrophies, Atrophy, X-Linked Bulbo-Spinal, X Linked Bulbo Spinal Atrophy, X-Linked Bulbo-Spinal Atrophy, X-Linked Bulbo-Spinal Atrophy
 +  CUI: C0026846, Name: Muscular Atrophy
Definition: Derangement in size and number of muscle fibers occurring with aging, reduction in blood supply, or following immobilization, prolonged weightlessness, malnutrition, and particularly in denervation.
TUI(s): T046
Aliases (abbreviated, total: 51): 
	 Muscular Atrophy, Muscular Atrophy, Muscular Atrophy, Muscular Atrophy, muscular atrophy, Muscular atrophy, Muscular atrophy, Muscular 