Tokenization

In [1]:
# Import spaCy and load the small English model; the resulting `nlp` callable
# is what the cells below use to turn raw text into a Doc
import spacy
nlp = spacy.load('en_core_web_sm')

In [7]:
# Define the sample text in this cell so the notebook runs on a fresh kernel.
# The surrounding double quotes are part of the text itself and come out as
# tokens of their own.
mystring = '"We\'re moving to L.A.!"'
doc = nlp(mystring)

In [8]:
# Show the text of every token, one per line.
token_texts = [token.text for token in doc]
for text in token_texts:
    print(text)

"
We
're
moving
to
L.A.
!
"


In [9]:
# A sentence mixing a contraction, a hyphenated word, an e-mail address and a URL.
doc2 = nlp(
    "We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!"
)

In [11]:
# One token per line: the hyphen is split off, while the e-mail address
# and the URL each stay whole.
print(''.join(token.text + '\n' for token in doc2), end='')

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [12]:
# Units and currency symbols are split off from the numbers they accompany.
doc3 = nlp('A 5km NYC cab ride costs $10.30')
print(''.join(token.text + '\n' for token in doc3), end='')

A
5
km
NYC
cab
ride
costs
$
10.30


In [16]:
# Tokenizer exceptions: abbreviations such as "St." and "U.S." keep their
# periods, while the sentence-final period becomes a token of its own.
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")
doc4_texts = [token.text for token in doc4]
for text in doc4_texts:
    print(text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [17]:
# Number of tokens in the Doc (punctuation and clitics such as 's count as tokens)
len(doc4)

11

In [18]:
# The vocabulary object attached to this Doc
doc4.vocab

<spacy.vocab.Vocab at 0x241f4ffa7c8>

In [19]:
# Number of entries in the Doc's vocabulary
# (presumably varies with the loaded model and spaCy version — verify)
len(doc4.vocab)

511

In [20]:
# A Doc can be indexed like a sequence; position 0 is the first token.
doc5 = nlp('It is better to give than to receive.')
doc5[0]

It

In [21]:
# Slicing follows Python's half-open convention: tokens 2, 3 and 4 are returned
doc5[2:5]

better to give

In [22]:
# Tokens in a Doc can't be replaced: item assignment raises TypeError.
# The failure is the point of this cell, so catch it and show the message
# instead of letting the exception abort a Restart & Run All.
try:
    doc5[0] = "text"
except TypeError as err:
    print('TypeError:', err)

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [23]:
# Named entities: a sample sentence containing a company, a place and a sum of money.
doc8 = nlp('Apple to build a Hong Kong factory for $6 million')

In [24]:
# Print the tokens on a single line, each one followed by ' | '.
print(''.join(token.text + ' | ' for token in doc8), end='')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [27]:
# For each named entity, show the span, its label, and spaCy's explanation
# of that label, followed by blank lines as a separator.
for ent in doc8.ents:
    label = ent.label_
    # print() stringifies its arguments itself, so no explicit str() is needed.
    print(ent, label, spacy.explain(label), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




## Noun Chunks: a noun plus the words describing that noun

In [28]:
# Parse a sample sentence, then list each noun chunk on its own line.
doc9 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

for noun_phrase in doc9.noun_chunks:
    print(noun_phrase)

Autonomous cars
insurance liability
manufacturers


## Tokenization Visualization

In [29]:
from spacy import displacy

In [33]:
# Draw the dependency parse inline; 'distance' sets the spacing between tokens.
doc = nlp('Apple is going to build a U.K. factory for $6 million.')
dep_options = {'distance': 50}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)

The optional 'distance' argument sets the distance between tokens. If the distance is made too small, text that appears beneath short arrows may become too compressed to read.

In [34]:
# Highlight the named entities inline.
# No 'distance' option here: it is a dependency-visualizer setting and has
# no effect when style='ent'.
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)