# Entities

In [30]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

# Data

In [3]:
def get_abstracts():
    with open("abstracts_manual.txt") as f:
        x = f.read()
    x = x.split("\n\n")
    x = [i.replace("\n", " ") for i in x]
    return(x)

In [4]:
abstracts = get_abstracts()

len(abstracts)

18

In [10]:
# create single doc for this analysis
doc = "".join(abstracts)
doc = nlp(doc)

# Analysis

## Part of speech tagging

In [13]:
i = 99

print(doc[i].text, doc[i].pos_, doc[i].tag_, spacy.explain(doc[i].tag_))

treatment NOUN NN noun, singular or mass


In [20]:
selection = doc[100:120]

for token in selection:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

,          PUNCT    ,      punctuation mark, comma
demarcating VERB     VBG    verb, gerund or present participle
the        DET      DT     determiner
immune     ADJ      JJ     adjective (English), other noun-modifier (Chinese)
and        CCONJ    CC     conjunction, coordinating
genomic    ADJ      JJ     adjective (English), other noun-modifier (Chinese)
landscape  NOUN     NN     noun, singular or mass
of         ADP      IN     conjunction, subordinating or preposition
cancers    NOUN     NNS    noun, plural
at         ADP      IN     conjunction, subordinating or preposition
their      PRON     PRP$   pronoun, possessive
earliest   ADJ      JJS    adjective, superlative
possible   ADJ      JJ     adjective (English), other noun-modifier (Chinese)
stages     NOUN     NNS    noun, plural
will       AUX      MD     verb, modal auxiliary
be         VERB     VB     verb, base form
crucial    ADJ      JJ     adjective (English), other noun-modifier (Chinese)
to         PART     TO    

## Top POS

In [22]:
counts = doc.count_by(spacy.attrs.POS)
counts

{92: 1224,
 87: 134,
 90: 297,
 93: 28,
 85: 491,
 97: 552,
 100: 399,
 84: 555,
 89: 155,
 86: 149,
 103: 46,
 95: 48,
 94: 39,
 96: 214,
 98: 26,
 101: 4,
 99: 25,
 91: 1}

In [25]:
# top 2 POS in doc
doc.vocab[92].text, doc.vocab[84].text

('NOUN', 'ADJ')

In [29]:
# something else in spac

# Count the different dependencies:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

398. acomp: 21
399. advcl: 24
400. advmod: 144
401. agent: 5
402. amod: 565
403. appos: 66
404. attr: 18
405. aux : 95
406. auxpass: 46
407. cc  : 159
408. ccomp: 24
410. conj: 214
412. csubj: 7
414. dep : 5
415. det : 275
416. dobj: 178
417. expl: 1
423. mark: 20
425. neg : 3
426. nmod: 57
428. npadvmod: 40
429. nsubj: 168
430. nsubjpass: 46
433. oprd: 2
436. parataxis: 1
438. pcomp: 30
439. pobj: 480
440. poss: 13
442. preconj: 3
443. prep: 477
444. prt : 1
445. punct: 577
446. quantmod: 5
447. relcl: 25
450. xcomp: 23
451. acl : 37
3965108062993911700. dative: 2
7037928807040764755. compound: 349
8110129090154140942. case: 1
8206900633647566924. ROOT: 148
12837356684637874264. nummod: 32


## Visualizations

In [52]:
selection = doc[282:318]
selection

Such future directions enable novel insights into the evolution of lung cancers and, thus, can provide a low-hanging fruit of targets for early immune-based treatment of this fatal malignancy.

In [53]:
displacy.render(selection, style='dep', jupyter=True, options={'distance': 110})

In [None]:
# serve it!
# displacy.serve(doc, style='dep', options={'distance': 110})

In [72]:
selection = doc[:500].as_doc()

# Create spans from Doc.sents:
spans = list(selection.sents)

displacy.render(spans, style='dep', options={'distance': 110})

In [70]:
type(selection.as_doc())

spacy.tokens.doc.Doc