In [1]:
import spacy
# Load the English model 'en_core_web_sm'; `nlp` is called on text below to build Doc objects.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run the loaded pipeline over a sample sentence to obtain a Doc object.
doc = nlp('Tesla is looking at buying U.S. startup for $6 million')

In [5]:
# For every token, show its text, part-of-speech tag, and syntactic dependency label.
for token in doc:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
# Inspect the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x21c54738898>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x21c55d52168>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x21c55d521c8>)]

In [8]:
# Tokenization is the first processing step.
# NOTE(review): "isn'" looks like a typo for "isn't"; the recorded output below reflects the
# text as written (tokens "isn" and "'"), so re-run the cell if the string is corrected.
# The run of extra spaces is kept as its own SPACE token (see output).
doc2 = nlp(u"Tesla isn'   looking into startups anymore.")

# Show text, part-of-speech tag, and dependency label for each token.
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
isn NOUN ROOT
' PUNCT punct
   SPACE 
looking VERB acl
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [10]:
# Part-of-speech tag of the first token ("Tesla").
doc2[0].pos_

'PROPN'

In [11]:
doc2[0].dep_ # Syntactic dependency label

'nsubj'

In [12]:
# Additional token attributes
# is_alpha: does the token consist of alphabetic characters?
doc2[0].is_alpha

True

In [13]:
# Fine-grained part-of-speech tag (more specific than pos_).
doc2[0].tag_

'NNP'

In [14]:
# Word shape: the token's pattern of upper-case and lower-case characters.
doc2[0].shape_

'Xxxxx'

In [15]:
doc2[0].is_stop # Is the token part of the stop-word list?

False

In [16]:
doc2[0].lemma_ # Base (dictionary) form of the word

'Tesla'

SPAN IS A SLICE OF THE DOC OBJECT

Large Doc objects can be hard to work with at times. A span is a slice of a Doc object, written in the form Doc[start:stop].

In [17]:
# A longer passage used to demonstrate slicing a Doc into a Span.
# Adjacent literals are concatenated into one single-line string.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [18]:
# Slice tokens 16 up to (not including) 30: the quoted phrase, quotation marks included.
life_quote = doc3[16:30]

In [19]:
# Printing a span shows the text it covers.
print(life_quote)

"Life is what happens to us while we are making other plans"


In [22]:
type(life_quote) # A slice of a Doc is a Span, not a Doc

spacy.tokens.span.Span

In [23]:
# The full processed text, by contrast, is a Doc.
type(doc3)

spacy.tokens.doc.Doc

Sentences

Certain tokens inside a Doc object may also receive a "start of sentence" tag. While this doesn't immediately build
a list of sentences, these tags enable the generation of sentence segments through Doc.sents. 
Later we'll write our own segmentation rules.

In [24]:
# Three short sentences used to demonstrate sentence segmentation.
doc4 = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

In [35]:
# spaCy separates the document into sentences; doc4.sents yields them one at a time.
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [36]:
# Token at index 6: the first word of the second sentence.
doc4[6]

This

In [37]:
# True because this token begins a sentence.
doc4[6].is_sent_start

True

In [38]:
# The following token sits in the middle of the sentence.
doc4[7]

is

In [39]:
# Shows no output when the token does not start a sentence
# (presumably the value is None, which notebooks do not display; confirm for your spaCy version).
doc4[7].is_sent_start