In [61]:
"""Tokenisation in Spacy"""

'Tokenisation in Spacy'

In [3]:
import spacy

In [4]:
"""loading the model """

nlp = spacy.load("en_core_web_sm")

In [5]:
# Run the pipeline on a sample sentence; `doc` holds the processed result.
doc = nlp("Tesla is looking at buying U.S startup for $6 million")

In [7]:
# A Doc is iterable: each item is one token, printed on its own line.
print(*doc, sep="\n")

Tesla
is
looking
at
buying
U.S
startup
for
$
6
million


In [8]:
# For each token show its text, its pos_ tag and its dep_ label
# (dep_ stands for syntactic dependency).
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [9]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1121250b8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1227f83a8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1227f8408>)]

In [None]:
"""Here 'ner' stands for named entity recognition"""
"""parsing in NLP is the process of determining the syntactic structure 
of a text"""

In [12]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [17]:
# The run of spaces after "isn't" is deliberate: it appears as its own
# SPACE token when the tokens are printed in the next cell.
doc2 = nlp("tesla isn't    looking into startups anymore.")

In [18]:
# Same text / pos_ / dep_ listing as before, this time for doc2.
for token in doc2:
    print(" ".join((token.text, token.pos_, token.dep_)))

tesla NOUN nsubj
is VERB ROOT
n't ADV neg
    SPACE 
looking VERB attr
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [None]:
"""Here we can see that even spaces are considered as token and spacy
knows that isn't is a combination of two tokens: is (VERB, ROOT) and
n't (ADV, neg)."""

In [23]:
doc[0].pos_

'PROPN'

In [21]:
spacy.explain('PROPN')

'proper noun'

In [26]:
doc[0].dep_, doc[0].lemma_, doc[0].tag_, doc[0].is_alpha, doc[0].is_stop

('nsubj', 'Tesla', 'NNP', True, False)

In [27]:
# A longer passage, written as adjacent string literals that Python joins
# into one string before it is handed to the pipeline.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [28]:
# Slice the Doc by token index: tokens 16-29 are the quoted phrase,
# including its opening and closing quotation marks (see the printed output).
life_quote = doc3[16:30]

In [29]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [37]:
type(life_quote)

spacy.tokens.span.Span

In [33]:
type(doc3)

spacy.tokens.doc.Doc

In [38]:
doc4 = nlp("This is the first sentence. This is the second sentence. This is the last sentence. ")

# Walk the sentences the pipeline detected and print each on its own line.
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is the second sentence.
This is the last sentence.


In [39]:
# Token 6 is the "This" that opens the second sentence.
doc4[6].is_sent_start

True

In [42]:
# Token 7 ("is") sits inside the second sentence, not at its start.
doc4[7].is_sent_start 
# The value is None here, which is why the cell shows no output.

In [43]:
"""Tokenisation"""

'Tokenisation'

In [44]:
# Sample text wrapped in literal double quotes, with a contraction and an
# abbreviation, used for the tokenisation cells below.
mystring = "\" we're moving to L.A.!\""

In [46]:
print(mystring)

" we're moving to L.A.!"


In [50]:
doc5 = nlp(mystring)

In [51]:
for token in doc5:
    print(token)

"
we
're
moving
to
L.A.
!
"


In [52]:
doc6 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")


In [53]:
for token in doc6:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [54]:
doc7 = nlp("A 5km NYC cab ride costs $10.30")

In [56]:
# "5km" comes out as two tokens ("5", "km") and "$10.30" as "$" and "10.30".
for token in doc7:
    print(token)

A
5
km
NYC
cab
ride
costs
$
10.30


In [58]:
len(doc7)

9

In [62]:
"""named entity recogniton"""

'named entity recogniton'

In [68]:
doc8= nlp(u"Apple to build a Hong Kong factory for $6 million")

In [69]:
# Show all tokens on a single line, each followed by " | ".
print("".join(f"{token.text} | " for token in doc8), end="")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [77]:
# List every named entity with its label and spaCy's explanation of that label.
for entity in doc8.ents:
    label = entity.label_
    print(entity)
    print(label)
    print(f"{label} : {spacy.explain(label)}")
    print("\n")  # leaves two blank lines between entities

Apple
ORG
ORG : Companies, agencies, institutions, etc.


Hong Kong
GPE
GPE : Countries, cities, states


$6 million
MONEY
MONEY : Monetary values, including unit




In [86]:
doc9 = nlp(u"My name is Pulkit Agrawal")


In [87]:
for chunk in doc9.noun_chunks:
    print(chunk)

My name
Pulkit Agrawal


In [89]:
# Named entities found in the sentence, each followed by its label.
for ent in doc9.ents:
    print(ent)
    print(ent.label_)

Pulkit Agrawal
PERSON


In [90]:
"""Tokenisation visualised"""

'Tokenisation visualised'

In [91]:
from spacy import displacy

In [92]:
# Sentence whose dependency parse is rendered in the next cell.
doc = nlp("Apple is going to build a U.K. factory for $6 million")


In [94]:
# Draw the dependency parse inline in the notebook; `distance` is handed
# to displaCy through its options dict.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 110},
)

In [95]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [97]:
# Render the "ent" visualisation of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [100]:
doc = nlp(u'this is a sentence')
# displacy.serve starts a web server for the visualisation (see the
# "Serving on http://0.0.0.0:5000" output) rather than drawing it inline.
# NOTE(review): in a notebook, displacy.render(doc, style="dep", jupyter=True)
# - as used in the cells above - is probably what is wanted; confirm and switch.
displacy.serve(doc, style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
