In [1]:
import spacy

In [2]:
# Load the small English model; `nlp` is the callable applied to text in the cells below.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence reused by the next few cells; calling `nlp` on text produces the parsed document.
doc = nlp('Tesla is looking to buy U.S. startup for $6 million.')

In [6]:
# `pos` (no trailing underscore) is the integer part-of-speech code.
for tok in doc:
    print(tok.text, tok.pos)

Tesla 96
is 87
looking 100
to 94
buy 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93
. 97


In [7]:
# Adding the trailing underscore (`pos_`) gives the readable string label instead of the code.
for tok in doc:
    print(tok.text, tok.pos_)

Tesla PROPN
is AUX
looking VERB
to PART
buy VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM
. PUNCT


In [9]:
# `dep_` is the token's syntactic dependency label, shown next to its part of speech.
for tok in doc:
    word, pos_label, dep_label = tok.text, tok.pos_, tok.dep_
    print(word, pos_label, dep_label)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
to PART aux
buy VERB xcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj
. PUNCT punct


In [11]:
# The processing pipeline as (name, component) pairs; 'ner' is the named entity recognizer.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x11c4cef60>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x11c629348>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x11c6293a8>)]

In [12]:
# Just the component names of the pipeline, in order.
nlp.pipe_names

['tagger', 'parser', 'ner']

The first step in processing text is to split it into its component parts (words and punctuation), called "tokens". These tokens are annotated inside the Doc object with descriptive information.

In [15]:
# Sentence with a contraction and a run of extra spaces, to show how both are tokenized below.
doc2 = nlp(u"Tesla isn't      looking into startups anymore.") 

Notice how `isn't` has been split into two tokens. spaCy recognizes both the root verb `is` and the negation `n't` attached to it.

In [16]:
# The run of extra spaces becomes its own token (tagged SPACE in the output).
# NOTE(review): 'Tesla' comes out as ADV here although it was PROPN in the earlier
# sentence -- presumably a tagger misprediction on this input; the cause is not confirmed.
for tok in doc2:
    word, pos_label, dep_label = tok.text, tok.pos_, tok.dep_
    print(word, pos_label, dep_label)

Tesla ADV nsubj
is AUX ROOT
n't PART neg
      SPACE 
looking VERB xcomp
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [41]:
# Look up the plain-English description of a tag or label.
spacy.explain('ADV')

'adverb'

In [46]:
# Surface form vs. lemma of token 4 ("looking").
looking = doc2[4]
print(looking.text, looking.lemma_, sep='\n')

looking
look


In [47]:
# Coarse part-of-speech label, then the fine-grained tag with its description, for token 4.
tok4 = doc2[4]
print(tok4.pos_)
print(' / '.join([tok4.tag_, spacy.explain(tok4.tag_)]))

VERB
VBG / verb, gerund or present participle


In [48]:
# Boolean flags on the first token: is it alphabetic, and is it a stop word?
first_tok = doc2[0]
for flag in (first_tok.is_alpha, first_tok.is_stop):
    print(flag)

True
False


In [21]:
# Slicing selects a run of tokens (here tokens 0 and 1). Indices count every token,
# including whitespace tokens: the run of spaces is token 3, so "looking" is token 4.
doc2[0:2]

Tesla is

In [26]:
# `pos_` is read from an individual token, so index a single position (token 0 is "Tesla").
doc2[0].pos_

'ADV'

In [27]:
# Token 1 is "is", which is tagged AUX. The whitespace run is token 3 and is tagged SPACE.
doc2[1].pos_

'AUX'

In [28]:
# Longer passage containing a quoted phrase; sliced into a Span in the cells below.
# Adjacent string literals are concatenated, so the text is the same single string as before.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    'cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.'
)

In [32]:
# Tokens 16 through 29: per the printed result this is the quoted phrase, quotation marks included.
life_quote = doc3[16:30]

In [33]:
# Printing the slice shows its text.
print(life_quote)

"Life is what happens to us while we are making other plans"


In [34]:
# A slice of a Doc is a Span: a portion of a larger document.
type(life_quote)

spacy.tokens.span.Span

In [35]:
# The object returned by `nlp(...)` is a Doc: the entire document.
type(doc3)

spacy.tokens.doc.Doc

In [36]:
# Three short sentences, used to demonstrate sentence segmentation below.
doc4 = nlp(u"This is the first sentence. This is the second sentence. This is the last sentence.")

In [37]:
# `doc4.sents` iterates sentence by sentence (each item is a sentence, not a single token).
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is the second sentence.
This is the last sentence.


In [38]:
# Token 6 is the "This" that opens the second sentence, so this is True.
# NOTE(review): the original note says other tokens displayed nothing instead of False --
# presumably the attribute is None for non-start tokens in this spaCy version; TODO confirm.
doc4[6].is_sent_start

True