# Entities etc 

With `scispacy`

In [10]:
import spacy
import pandas as pd

# Data

In [11]:
def get_abstracts():
    with open("abstracts_manual.txt") as f:
        x = f.read()
    x = x.split("\n\n")
    x = [i.replace("\n", " ") for i in x]
    return(x)

In [12]:
abstracts = get_abstracts()

len(abstracts)

18

In [13]:
# create single doc for this analysis
text = "".join(abstracts)
text = text

In [14]:
text2 = "Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily."

# ! Available labels

In [15]:
nlp = spacy.load("en_ner_bionlp13cg_md")

In [16]:
# 1. show available pipes
nlp.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

In [17]:
# 2. show available labels
nlp.get_pipe('ner').labels

('AMINO_ACID',
 'ANATOMICAL_SYSTEM',
 'CANCER',
 'CELL',
 'CELLULAR_COMPONENT',
 'DEVELOPING_ANATOMICAL_STRUCTURE',
 'GENE_OR_GENE_PRODUCT',
 'IMMATERIAL_ANATOMICAL_ENTITY',
 'MULTI_TISSUE_STRUCTURE',
 'ORGAN',
 'ORGANISM',
 'ORGANISM_SUBDIVISION',
 'ORGANISM_SUBSTANCE',
 'PATHOLOGICAL_FORMATION',
 'SIMPLE_CHEMICAL',
 'TISSUE')

# Abbreviations

In [20]:
from scispacy.abbreviation import AbbreviationDetector

In [21]:
nlp = spacy.load("en_core_sci_sm")

# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector")

doc = nlp(text)

In [22]:
TOP = 20

print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations[:TOP]:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
PD-1 	 (365, 366) programmed cell death 1
NSCLC 	 (396, 397) non-small cell lung cancer
NSCLC 	 (2580, 2581) non-small cell lung cancer
NSCLC 	 (3720, 3721) non-small cell lung cancer
NSCLC 	 (1326, 1327) non-small cell lung cancer
NSCLC 	 (2587, 2588) non-small cell lung cancer
NSCLC 	 (383, 384) non-small cell lung cancer
NSCLC 	 (3729, 3730) non-small cell lung cancer
NSCLC 	 (939, 940) non-small cell lung cancer
NSCLC 	 (1268, 1269) non-small cell lung cancer
NSCLC 	 (1301, 1302) non-small cell lung cancer
NSCLC 	 (414, 415) non-small cell lung cancer
MoAbs 	 (894, 895) Monoclonal antibodies
MoAbs 	 (917, 918) Monoclonal antibodies
Treg 	 (1031, 1032) T regulatory cells
ICIs 	 (1911, 1912) immune checkpoint inhibitors
ICIs 	 (1063, 1064) immune checkpoint inhibitors
ICIs 	 (1413, 1414) immune checkpoint inhibitors
ICIs 	 (1944, 1945) immune checkpoint inhibitors
ICIs 	 (1289, 1290) immune checkpoint inhibitors


In [23]:
# print only unique abbrevs
# need str() method
abbrevs = dict()

for abrv in doc._.abbreviations:
    abbrevs[str(abrv)] = str(abrv._.long_form)

In [24]:
pd.Series(abbrevs).sort_index()

ADC                                          adenocarcinomas
ATM/ATR       ataxia–telangiectasia mutated and Rad3-related
BALF                            bronchoalveolar lavage fluid
DDR                                      DNA damage response
DDRs                                    DNA damage responses
DNA-PKcs     DNA-dependent protein kinase, catalytic subunit
EGFR                   epidermal growth factor receptor gene
EGFR TKIs                    EGFR tyrosine kinase inhibitors
EMT                     epithelial to mesenchymal transition
GEMMs                    genetically engineered mouse models
ICIs                            immune checkpoint inhibitors
MoAbs                                  Monoclonal antibodies
NSCLC                             non-small cell lung cancer
PD-1                                 programmed cell death 1
PDXs                              patient derived xenografts
SCC                                  squamous cell carcinoma
SCLC                    

# Entity Linker (NEED DOWNLOAD)

https://github.com/allenai/scispacy#example-usage-1

In [25]:
#from scispacy.linking import EntityLinker

# Genes and other special entities

In [68]:
nlp = spacy.load("en_ner_bionlp13cg_md")

doc = nlp(text)

In [89]:
# single
x = doc.ents[2]

x, x.label_

(advanced-stage, 'CANCER')

In [93]:
# all special entities

attrs = ["text", "label_", "start", "end", "start_char", "end_char"]
data = [
    [str(getattr(ent, attr)) for attr in attrs]
    for ent in doc.ents
]
df = pd.DataFrame(data, columns=attrs)

In [94]:
df

Unnamed: 0,text,label_,start,end,start_char,end_char
0,Lung cancer,CANCER,0,2,0,11
1,malignancy,CANCER,12,13,66,76
2,advanced-stage,CANCER,25,26,154,168
3,lung cancer patients,CANCER,50,53,319,339
4,undauntedly,GENE_OR_GENE_PRODUCT,60,61,372,383
...,...,...,...,...,...,...
409,273H,GENE_OR_GENE_PRODUCT,4085,4086,25469,25473
410,human lung cancer,ORGANISM,4089,4092,25479,25496
411,mice,ORGANISM,4102,4103,25552,25556
412,mouse,ORGANISM,4105,4106,25562,25567


In [98]:
df[df.label_.isin(["CANCER"])]

Unnamed: 0,text,label_,start,end,start_char,end_char
0,Lung cancer,CANCER,0,2,0,11
1,malignancy,CANCER,12,13,66,76
2,advanced-stage,CANCER,25,26,154,168
3,lung cancer patients,CANCER,50,53,319,339
5,cancer,CANCER,71,72,442,448
...,...,...,...,...,...,...
402,lung cancer,CANCER,4018,4020,25087,25098
404,lung adenocarcinomas,CANCER,4034,4036,25177,25197
406,lung cancer,CANCER,4059,4061,25312,25323
407,anti-cancer,CANCER,4069,4070,25374,25385


# Genes

In [54]:
nlp = spacy.load("en_ner_bionlp13cg_md")

In [63]:
doc = nlp(text)

In [56]:
for ent in doc.ents:
    print(ent)
    print(ent.label_)

bulbar muscular
PATHOLOGICAL_FORMATION
neuron
CELL
androgen receptor
GENE_OR_GENE_PRODUCT
AR
GENE_OR_GENE_PRODUCT


In [41]:
x = doc.ents[2]

In [57]:
print(nlp.get_pipe('ner').labels)

('AMINO_ACID', 'ANATOMICAL_SYSTEM', 'CANCER', 'CELL', 'CELLULAR_COMPONENT', 'DEVELOPING_ANATOMICAL_STRUCTURE', 'GENE_OR_GENE_PRODUCT', 'IMMATERIAL_ANATOMICAL_ENTITY', 'MULTI_TISSUE_STRUCTURE', 'ORGAN', 'ORGANISM', 'ORGANISM_SUBDIVISION', 'ORGANISM_SUBSTANCE', 'PATHOLOGICAL_FORMATION', 'SIMPLE_CHEMICAL', 'TISSUE')


In [66]:
def get_genes(doc, nlp):
    genes = dict()
    for i, ent in enumerate(doc.ents):
        if ent.label_ == "GENE_OR_GENE_PRODUCT":
            genes[i] = ent.label_
    return(genes)

In [67]:
get_genes(doc, nlp)

{'undauntedly': 'GENE_OR_GENE_PRODUCT',
 '“tumor-type': 'GENE_OR_GENE_PRODUCT',
 '”': 'GENE_OR_GENE_PRODUCT',
 'naïve': 'GENE_OR_GENE_PRODUCT',
 'immunogenomic-based': 'GENE_OR_GENE_PRODUCT',
 'PD-1': 'GENE_OR_GENE_PRODUCT',
 'STAT3': 'GENE_OR_GENE_PRODUCT',
 'EGFR': 'GENE_OR_GENE_PRODUCT',
 'FOXO3a': 'GENE_OR_GENE_PRODUCT',
 'TGF-β': 'GENE_OR_GENE_PRODUCT',
 'COX-2': 'GENE_OR_GENE_PRODUCT',
 'Bcl-2': 'GENE_OR_GENE_PRODUCT',
 'PI3KAkt/mTOR': 'GENE_OR_GENE_PRODUCT',
 'Fas/FasL': 'GENE_OR_GENE_PRODUCT',
 'Cdc42': 'GENE_OR_GENE_PRODUCT',
 'E-cadherin': 'GENE_OR_GENE_PRODUCT',
 'MMPs': 'GENE_OR_GENE_PRODUCT',
 'adiponectin': 'GENE_OR_GENE_PRODUCT',
 'anti-PD-1/PD-L1': 'GENE_OR_GENE_PRODUCT',
 'PDLIM2': 'GENE_OR_GENE_PRODUCT',
 'NF-κB/RelA': 'GENE_OR_GENE_PRODUCT',
 'PDLIM2-independent PD-L1': 'GENE_OR_GENE_PRODUCT',
 'PD1': 'GENE_OR_GENE_PRODUCT',
 'CTLA4': 'GENE_OR_GENE_PRODUCT',
 'PD1/PDL1': 'GENE_OR_GENE_PRODUCT',
 'pembrolizumab': 'GENE_OR_GENE_PRODUCT',
 'durvalumab': 'GENE_OR_GENE_PR

In [65]:
text

"Lung cancer is the number one cause of cancer-related deaths. The malignancy is characterized by dismal prognosis and poor clinical outcome mostly due to advanced-stage at diagnosis, thereby inflicting a heavy burden on public health worldwide. Recent breakthroughs in immunotherapy have greatly benefited a subset  of lung cancer patients, and more importantly, they are undauntedly bringing forth a paradigm shift in the drugs approved for cancer treatment, by introducing “tumor-type agnostic therapies”. Yet, and to fulfill immunotherapy's potential of personalized cancer treatment, demarcating the immune and genomic landscape of cancers at their earliest possible stages will be crucial to identify ideal targets for early treatment and to predict how a particular patient will fare with immunotherapy. Recent genomic surveys of premalignant lung cancer have shed  light on early alterations in the evolution of lung cancer. More recently, the advent of immunogenomic technologies has provide

# Part of speech tagging

In [6]:
selection = doc[100:120]

for token in selection:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

,          PUNCT    ,      punctuation mark, comma
demarcating VERB     VBG    verb, gerund or present participle
the        DET      DT     determiner
immune     ADJ      JJ     adjective (English), other noun-modifier (Chinese)
and        CCONJ    CC     conjunction, coordinating
genomic    ADJ      JJ     adjective (English), other noun-modifier (Chinese)
landscape  NOUN     NN     noun, singular or mass
of         ADP      IN     conjunction, subordinating or preposition
cancers    NOUN     NNS    noun, plural
at         ADP      IN     conjunction, subordinating or preposition
their      PRON     PRP$   pronoun, possessive
earliest   ADJ      JJS    adjective, superlative
possible   ADJ      JJ     adjective (English), other noun-modifier (Chinese)
stages     NOUN     NNS    noun, plural
will       AUX      MD     verb, modal auxiliary
be         VERB     VB     verb, base form
crucial    ADJ      JJ     adjective (English), other noun-modifier (Chinese)
to         PART     TO    

## Top POS

In [7]:
counts = doc.count_by(spacy.attrs.POS)
counts

{92: 1224,
 87: 134,
 90: 297,
 93: 28,
 85: 491,
 97: 552,
 100: 399,
 84: 555,
 89: 155,
 86: 149,
 103: 46,
 95: 48,
 94: 39,
 96: 214,
 98: 26,
 101: 4,
 99: 25,
 91: 1}

In [8]:
# top 2 POS in doc
doc.vocab[92].text, doc.vocab[84].text

('NOUN', 'ADJ')

In [9]:
# something else in spac

# Count the different dependencies:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

398. acomp: 21
399. advcl: 24
400. advmod: 144
401. agent: 5
402. amod: 565
403. appos: 66
404. attr: 18
405. aux : 95
406. auxpass: 46
407. cc  : 159
408. ccomp: 24
410. conj: 214
412. csubj: 7
414. dep : 5
415. det : 275
416. dobj: 178
417. expl: 1
423. mark: 20
425. neg : 3
426. nmod: 57
428. npadvmod: 40
429. nsubj: 168
430. nsubjpass: 46
433. oprd: 2
436. parataxis: 1
438. pcomp: 30
439. pobj: 480
440. poss: 13
442. preconj: 3
443. prep: 477
444. prt : 1
445. punct: 577
446. quantmod: 5
447. relcl: 25
450. xcomp: 23
451. acl : 37
3965108062993911700. dative: 2
7037928807040764755. compound: 349
8110129090154140942. case: 1
8206900633647566924. ROOT: 148
12837356684637874264. nummod: 32


## Visualizations

In [10]:
selection = doc[282:318]
selection

Such future directions enable novel insights into the evolution of lung cancers and, thus, can provide a low-hanging fruit of targets for early immune-based treatment of this fatal malignancy.

In [11]:
displacy.render(selection, style='dep', jupyter=True, options={'distance': 110})

# Abbreviations