## NLP in Python

- There are two great NLP libraries in Python
    - NLTK
    - spaCy

In [5]:
# One-time setup (run in a shell): python3 -m spacy download en_core_web_sm
import spacy

# Load the small English pipeline and parse a sample sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per token: text, lemma, POS, tag, dependency label, word shape,
# is-alphabetic flag, is-stop-word flag (the two booleans are stringified).
for token in doc:
    columns = [
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        str(token.is_alpha),
        str(token.is_stop),
    ]
    print("\t\t".join(columns))

Apple		Apple		PROPN		NNP		nsubj		Xxxxx		True		False
is		be		AUX		VBZ		aux		xx		True		True
looking		look		VERB		VBG		ROOT		xxxx		True		False
at		at		ADP		IN		prep		xx		True		True
buying		buy		VERB		VBG		pcomp		xxxx		True		False
U.K.		U.K.		PROPN		NNP		dobj		X.X.		False		False
startup		startup		NOUN		NN		advcl		xxxx		True		False
for		for		ADP		IN		prep		xxx		True		True
$		$		SYM		$		quantmod		$		False		False
1		1		NUM		CD		compound		d		False		False
billion		billion		NUM		CD		pobj		xxxx		True		False


## Typed Dependency Relations

- Each word in a sentence is linked to a head word by a typed grammatical dependency relation (for example `nsubj` or `xcomp`)

In [4]:
# Parse a sample sentence and list each token next to its dependency label.
doc = nlp("It was incredibly easy to set up and use")

for token in doc:
    print(f"{token.text}\t\t{token.dep_}")

It		nsubj
was		ROOT
incredibly		advmod
easy		acomp
to		aux
set		xcomp
up		prt
and		cc
use		conj


## Named Entity Recognition (NER) in spaCy

In [148]:
# `nlp` is the en_core_web_sm pipeline loaded in the first code cell; it is
# reused here rather than being loaded from disk a second time.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Print every named-entity span together with its entity label.
for ent in doc.ents:
    print(f"{ent.text}\t\t{ent.label_}")

Apple		ORG
U.K.		GPE
$1 billion		MONEY


In [130]:
# One-time setup (run in a shell): python3 -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")

ez_sentence = "I like to work on NLP projects."
doc = nlp(ez_sentence)

# For every token, show its syntactic head and the tokens directly attached to it.
print("token", "head", "children", sep="\t\t")
for token in doc:
    dependents = list(token.children)
    # The list is passed as a second print argument, so a single space
    # separates it from the trailing tabs.
    print(f"{token.text}\t\t{token.head.text}\t\t", dependents)

token		head		children
I		like		 []
like		like		 [I, work, .]
to		work		 []
work		like		 [to, on]
on		work		 [projects]
NLP		projects		 []
projects		on		 [NLP]
.		like		 []


In [46]:
# Dump the parsed doc as a dict; the recorded output shows the keys
# 'text', 'ents', 'sents' and 'tokens'.
print(doc.to_json())

{'text': 'I like to work on NLP projects.', 'ents': [{'start': 18, 'end': 21, 'label': 'ORG'}], 'sents': [{'start': 0, 'end': 31}], 'tokens': [{'id': 0, 'start': 0, 'end': 1, 'tag': 'PRP', 'pos': 'PRON', 'morph': 'Case=Nom|Number=Sing|Person=1|PronType=Prs', 'lemma': 'I', 'dep': 'nsubj', 'head': 1}, {'id': 1, 'start': 2, 'end': 6, 'tag': 'VBP', 'pos': 'VERB', 'morph': 'Tense=Pres|VerbForm=Fin', 'lemma': 'like', 'dep': 'ROOT', 'head': 1}, {'id': 2, 'start': 7, 'end': 9, 'tag': 'TO', 'pos': 'PART', 'morph': '', 'lemma': 'to', 'dep': 'aux', 'head': 3}, {'id': 3, 'start': 10, 'end': 14, 'tag': 'VB', 'pos': 'VERB', 'morph': 'VerbForm=Inf', 'lemma': 'work', 'dep': 'xcomp', 'head': 1}, {'id': 4, 'start': 15, 'end': 17, 'tag': 'IN', 'pos': 'ADP', 'morph': '', 'lemma': 'on', 'dep': 'prep', 'head': 3}, {'id': 5, 'start': 18, 'end': 21, 'tag': 'NNP', 'pos': 'PROPN', 'morph': 'NounType=Prop|Number=Sing', 'lemma': 'NLP', 'dep': 'compound', 'head': 6}, {'id': 6, 'start': 22, 'end': 30, 'tag': 'NNS',

In [50]:
# The header names all three columns printed for each token:
# surface text, lemma and part-of-speech.
print("token\t\tlemma\t\tpos")
for token in doc:
    print(f"{token.text}\t\t{token.lemma_}\t\t{token.pos_}")

token		lemma
I		I		PRON
like		like		VERB
to		to		PART
work		work		VERB
on		on		ADP
NLP		NLP		PROPN
projects		project		NOUN
.		.		PUNCT


In [47]:
# Table of each token, its syntactic head, and the dependency label linking them.
print("token", "head", "dependency", sep="\t\t")
for token in doc:
    # The label is a separate print argument, so one space follows the tabs.
    print(f"{token.text}\t\t{token.head.text}\t\t", token.dep_)

token		head		dependency
I		like		 nsubj
like		like		 ROOT
to		work		 aux
work		like		 xcomp
on		work		 prep
NLP		projects		 compound
projects		on		 pobj
.		like		 punct


## Dependency Relations Visualization

In [34]:
# Disabled visualization cell. The recorded output below shows displacy.serve
# starting a web server on port 5000 and later shutting it down.
# NOTE(review): presumably the call blocks the kernel while serving — confirm;
# displacy.render may be the better fit when running inside a notebook.
# from spacy import displacy

# displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [21/Jun/2021 10:45:40] "GET / HTTP/1.1" 200 5885
127.0.0.1 - - [21/Jun/2021 10:45:40] "GET /favicon.ico HTTP/1.1" 200 5885


Shutting down server on port 5000.


In [52]:
# Text of every sentence detected in the doc.
[str(sentence) for sentence in doc.sents]

['I like to work on NLP projects.']

In [56]:
# Human-readable description of the "PART" part-of-speech label (assigned to "to" above).
spacy.explain("PART")

'particle'

In [57]:
# Human-readable description of the "ADP" part-of-speech label (assigned to "on" above).
spacy.explain("ADP")

'adposition'

In [60]:
# Human-readable description of the "PROPN" part-of-speech label (assigned to "NLP" above).
spacy.explain("PROPN")

'proper noun'

In [61]:
# Human-readable description of the "PRON" part-of-speech label (assigned to "I" above).
spacy.explain("PRON")

'pronoun'

In [62]:
# Human-readable description of the "nsubj" dependency label.
spacy.explain("nsubj")

'nominal subject'

In [63]:
# Human-readable description of the "xcomp" dependency label.
spacy.explain("xcomp")

'open clausal complement'

## Activity: Obtain all nouns in a given doc

In [76]:
running_sentence = "Use some of our test sentences; Joey's not very smart, nor charming."
doc = nlp(running_sentence)

# Keep only the tokens whose part-of-speech label is NOUN and print their text.
noun_tokens = (token for token in doc if token.pos_ == "NOUN")
for noun in noun_tokens:
    print(noun.text)

test
sentences


## Activity: Obtain all adjectives in a given doc

In [78]:
running_sentence = "Use some of our test sentences; Joey's not very smart, nor charming."
doc = nlp(running_sentence)

# Keep only the tokens whose part-of-speech label is ADJ and print their text.
adjective_tokens = (token for token in doc if token.pos_ == "ADJ")
for adjective in adjective_tokens:
    print(adjective.text)

smart
charming


## Compare the similarity of two docs

In [64]:
# Parse two sentences that differ only in their final noun.
cat_text, dog_text = "I like cats", "I like dogs"
doc1 = nlp(cat_text)
doc2 = nlp(dog_text)

# NOTE(review): the recorded output includes a warning fragment. The small
# en_core_web_sm model is believed to ship without static word vectors, so
# this score may not be a meaningful similarity — confirm against the spaCy
# docs and consider a model with vectors (e.g. en_core_web_md).
doc1.similarity(doc2)

  This is separate from the ipykernel package so we can avoid doing imports until


0.925736944702134

In [66]:
# Show the first token of each of the two docs.
for parsed in (doc1, doc2):
    print(parsed[0])

I
I


In [65]:
# Similarity between the first tokens of the two docs (both printed as "I" above).
doc1[0].similarity(doc2[0])

1.0

In [71]:
# Inspect the vector attached to the first token of doc1.
doc1[0].vector

array([ 0.7234746 ,  0.38154346,  0.66175425,  0.7835334 , -0.08129972,
        0.8285754 ,  0.30813095, -0.6761173 , -0.48376414, -0.4713223 ,
       -0.7022138 , -0.2862827 ,  0.8272269 ,  0.30487955, -0.02827258,
       -0.35199   ,  2.1878786 , -0.5114093 ,  1.4417213 ,  0.02294272,
        1.0511227 ,  1.9509596 , -0.53758466,  1.0180392 ,  1.7537245 ,
        0.5439365 , -0.12415107, -0.7842591 ,  0.11994658, -0.1651625 ,
        1.185293  , -0.80969834,  0.09500918,  0.8644767 ,  0.8298837 ,
       -0.34343302, -0.57643485, -0.08808553,  1.0751209 , -0.8416685 ,
       -0.8024647 , -0.6332845 ,  0.3363381 ,  0.57827204,  0.17947425,
        0.05619171,  0.03974978, -0.3117528 ,  0.21147938, -0.1747675 ,
       -0.5122521 , -1.0206125 , -0.8476414 , -0.5519779 , -0.13225076,
       -0.98096585, -0.28991407, -0.3955323 ,  0.31543267, -0.141415  ,
       -0.4099917 , -0.7666892 , -0.6018261 , -0.3410501 , -0.997912  ,
       -0.5727545 , -1.1035743 , -0.3857379 , -0.55139315, -0.17

In [72]:
# Number of components in the first token's vector.
doc1[0].vector.shape[0]

96

In [75]:
# Average the token vectors of doc1 over however many tokens it contains,
# rather than hard-coding three indices and a divisor of 3. Vectors are added
# left to right, so the result for the three-token doc is unchanged.
sum(token.vector for token in doc1) / len(doc1)

array([ 0.65953344,  0.20457071, -0.2168029 , -0.0663239 , -0.02564881,
        1.1735222 ,  0.5488158 , -0.6949029 , -0.4826446 , -0.4281449 ,
       -0.53467286, -0.17537081,  0.01190096, -0.21929957,  0.40536675,
        0.16490836,  1.2217817 , -0.95645934,  0.10238484,  0.15352367,
        0.35413232,  0.17164588, -0.3990942 ,  1.0606269 ,  0.35469282,
        0.276727  ,  0.30499485, -0.46575412,  0.3559837 ,  0.38793695,
       -0.45595416, -0.5019067 , -0.28663364,  0.70636016,  0.04505293,
        0.00191317, -0.20185769, -0.14363174,  0.65396357,  0.23927905,
       -0.09493562, -0.39411843, -0.41607657,  0.7398642 ,  0.9191839 ,
       -0.16872291,  0.4406608 , -0.11798272,  0.10440584,  0.02230643,
       -0.64892036, -0.2050776 ,  0.06356782, -0.09158478,  0.85268706,
       -0.9675676 ,  0.1344917 , -0.5549603 ,  0.51177484,  0.12765697,
       -0.34897205, -0.9882657 ,  0.03326197, -0.24359477, -0.21692814,
        0.27825445, -0.37806988, -0.73021287, -0.54079616, -0.23

In [74]:
# Doc-level vector; the recorded values match the token-vector average
# computed in the previous cell.
doc1.vector

array([ 0.65953344,  0.20457071, -0.2168029 , -0.0663239 , -0.02564881,
        1.1735222 ,  0.5488158 , -0.6949029 , -0.4826446 , -0.4281449 ,
       -0.53467286, -0.17537081,  0.01190096, -0.21929957,  0.40536675,
        0.16490836,  1.2217817 , -0.95645934,  0.10238484,  0.15352367,
        0.35413232,  0.17164588, -0.3990942 ,  1.0606269 ,  0.35469282,
        0.276727  ,  0.30499485, -0.46575412,  0.3559837 ,  0.38793695,
       -0.45595416, -0.5019067 , -0.28663364,  0.70636016,  0.04505293,
        0.00191317, -0.20185769, -0.14363174,  0.65396357,  0.23927905,
       -0.09493562, -0.39411843, -0.41607657,  0.7398642 ,  0.9191839 ,
       -0.16872291,  0.4406608 , -0.11798272,  0.10440584,  0.02230643,
       -0.64892036, -0.2050776 ,  0.06356782, -0.09158478,  0.85268706,
       -0.9675676 ,  0.1344917 , -0.5549603 ,  0.51177484,  0.12765697,
       -0.34897205, -0.9882657 ,  0.03326197, -0.24359477, -0.21692814,
        0.27825445, -0.37806988, -0.73021287, -0.54079616, -0.23

## Stemming and lemmatization

- https://stackabuse.com/python-for-nlp-tokenization-stemming-and-lemmatization-with-spacy-library

## References

- https://github.com/explosion/spaCy/blob/master/spacy/glossary.py