In [1]:
!pip install scispacy spacy

Collecting scispacy
  Downloading scispacy-0.3.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 4.1 MB/s eta 0:00:011
Collecting pysbd
  Downloading pysbd-0.3.3-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.0 MB/s eta 0:00:011
Collecting nmslib>=1.7.3.6
  Downloading nmslib-2.0.6-cp38-cp38-macosx_10_14_x86_64.whl (907 kB)
[K     |████████████████████████████████| 907 kB 1.6 MB/s eta 0:00:01
Collecting pybind11>=2.2.3
  Downloading pybind11-2.6.0-py2.py3-none-any.whl (187 kB)
[K     |████████████████████████████████| 187 kB 8.7 MB/s eta 0:00:01
[?25hInstalling collected packages: pysbd, pybind11, nmslib, scispacy
Successfully installed nmslib-2.0.6 pybind11-2.6.0 pysbd-0.3.3 scispacy-0.3.0


In [5]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz (33.1 MB)
[K     |████████████████████████████████| 33.1 MB 5.5 MB/s eta 0:00:011
Building wheels for collected packages: en-core-sci-sm
  Building wheel for en-core-sci-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-sci-sm: filename=en_core_sci_sm-0.2.5-py3-none-any.whl size=33155835 sha256=472a084a2c3aefc31cd55d049233e1f117e3d64e51ba28024bdddb4cdb901704
  Stored in directory: /Users/sdeshpande/Library/Caches/pip/wheels/b8/31/95/e58e692aba89fb29e03f4cc865fd7999b48d8775b7be37bfd4
Successfully built en-core-sci-sm
Installing collected packages: en-core-sci-sm
  Attempting uninstall: en-core-sci-sm
    Found existing installation: en-core-sci-sm 0.2.0
    Uninstalling en-core-sci-sm-0.2.0:
      Successfully uninstalled en-core-sci-sm-0.2.0
Successfully instal

In [6]:
import scispacy
import spacy
import en_core_sci_sm
from spacy import displacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker

#Load the model
nlp = en_core_sci_sm.load()
# nlp = spacy.load("en")

In [7]:
text = """
Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity. They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC).
"""
doc = nlp(text)


#Split the text into sentences.

print(list(doc.sents))

[
Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity., They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC).
]


In [8]:
#Print the Medical or Clinical entities present in the input text
print(doc.ents)

(MDSC, myeloid cells, immunosuppressive activity, accumulate, tumor-bearing mice, humans, cancer, hepatocellular carcinoma, HCC)


In [9]:
# visualise dependency parses
displacy.render(next(doc.sents), style='dep', jupyter=True)

In [10]:
# Add the abbreviation pipe to the spacy pipeline.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

In [11]:
doc = nlp(text)

In [12]:
#Print the Abbreviation and it's definition
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
MDSC 	 (6, 7) Myeloid derived suppressor cells
HCC 	 (33, 34) hepatocellular carcinoma


In [13]:
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp.add_pipe(linker)

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmph10a41kv
Finished download, copying /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmph10a41kv to cache at /Users/sdeshpande/.scispacy/datasets/e9f7327283e43f0482f7c0c71b71dec278a58ccb3ffdd03c2c2350159e7ef146.f2a350ad19015b2591545f7feeed6a6d6d2fffcd635d868a5d7fc0dfc3cadfd8.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/nmslib_index.bin not found in cache, downloading to /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmprxnhy3ox
Finished download, copying /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmprxnhy3ox to cache at /Users/sdeshpande/.scispacy/datasets/f48455d6c79262057cce66b4619123c2b558b21092d42fac97f47bb99a5b8f9f.dd70d3dffe7d90d7ac8914460e16a48375dab32485fb6313a34e6fbcaf53218b.nmslib_index.bin
https://s3-us-west-2.amazonaws



https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/concept_aliases.json not found in cache, downloading to /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmp6aj407_b
Finished download, copying /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmp6aj407_b to cache at /Users/sdeshpande/.scispacy/datasets/1428ec15d3b1061731ea273c03699130b3d6b90948993e74bda66af605ff8e2a.aeb7a686c654df6bccb6c2c23d3eda3eb381daaefda4592b58158d0bee53b352.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2020-10-09/umls_2020_aa_cat0129.jsonl not found in cache, downloading to /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmpqlzyizs_
Finished download, copying /var/folders/z4/zq1sx9z918l822zf41yhflvr0000gn/T/tmpqlzyizs_ to cache at /Users/sdeshpande/.scispacy/datasets/4d7fb8fcae1035d1e0a47d9072b43d5a628057d35497fbfb2499b4b7b2dd4dd7.05ec7eef12f336d4666da85b7fa69b9401883a7dd4244473f7b88b413ccbba03.umls_2020_aa_cat0129.jsonl
https://s3-us-west-2.amazon

In [14]:
doc = nlp(text)

In [15]:
# Let's look at a random entity!
entity = doc.ents[1]

print("Name: ", entity)

Name:  myeloid cells


In [16]:
# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
for umls_ent in entity._.umls_ents:
    print(linker.umls.cui_to_entity[umls_ent[0]])

CUI: C0887899, Name: Myeloid Cells
Definition: The classes of BONE MARROW-derived blood cells in the monocytic series (MONOCYTES and their precursors) and granulocytic series (GRANULOCYTES and their precursors).
TUI(s): T025
Aliases: (total: 6): 
	 Cells, Myeloid, Myeloid Cells, myeloid cell, myeloid cells, Myeloid Cell, Cell, Myeloid
CUI: C0596993, Name: Myeloid Progenitor Cells
Definition: A hematopoietic stem cell found in the bone marrow that is committed to form erythrocytes, megakaryocytes, and all leukocytes except lymphocytes.
TUI(s): T025
Aliases (abbreviated, total: 17): 
	 Bone Marrow Myeloid Stem Cell, Cell, Myeloid Stem, Cells, Myeloid Stem, stem cells myeloid, Myeloid Stem Cells, Myeloid Progenitor Cell, Myeloid cell, Progenitor Cells, Myeloid, Stem Cell, Myeloid, Myeloid Progenitor Cells
CUI: C4321406, Name: Myeloid Progenitor Cell Count
Definition: The determination of the number of myeloid progenitor cells in a sample.
TUI(s): T059
Aliases: (total: 3): 
	 MYPC, Myeloid

In [17]:
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

In [18]:
#Find the entities and it's definition

entity = doc.ents
# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
for i in range(len(entity)):
    for umls_ent in entity[i]._.umls_ents:
        print(linker.umls.cui_to_entity[umls_ent[0]])

CUI: C0521329, Name: Spinal
Definition: Of or relating to the spine or spinal cord.
TUI(s): T082
Aliases: (total: 2): 
	 Spinal, spinal
CUI: C0037922, Name: Spinal Canal
Definition: The cavity within the SPINAL COLUMN through which the SPINAL CORD passes.
TUI(s): T030
Aliases (abbreviated, total: 15): 
	 vertebral canal, canal spinal, Vertebral Canal, Spinal canal, NOS, Vertebral canal, NOS, neural canal, Spinal Canals, Canal, Spinal, Spinal canal structure, Spinal canal
CUI: C3887662, Name: Intraspinal Neoplasm
Definition: A primary or metastatic neoplasm that occurs within the spinal canal including the spinal cord and surrounding paraspinal spaces.
TUI(s): T191
Aliases (abbreviated, total: 16): 
	 Spinal Canal Tumors, neoplasm spinal, Neoplasms of the Spinal Canal and Spinal Cord, Spinal Neoplasms, Tumor of the Spinal Canal and Spinal Cord, Neoplasms of Spinal Canal and Spinal Cord, Tumor of Spinal Canal and Spinal Cord, Neoplasm of the Spinal Canal and Spinal Cord, Spinal Tumors, I

In [19]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Spinal 0 6 ENTITY
bulbar muscular atrophy 11 34 ENTITY
SBMA 36 40 ENTITY
inherited 59 68 ENTITY
motor neuron disease 69 89 ENTITY
expansion 104 113 ENTITY
polyglutamine tract 130 149 ENTITY
androgen receptor 161 178 ENTITY
AR 180 182 ENTITY
SBMA 196 200 ENTITY


In [20]:
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
