In [1]:
# Import spaCy and load the language library
import spacy

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Show the installed spaCy version so readers know which release produced the outputs below
spacy.__version__

'3.0.5'

In [3]:
# Load the `en_core_web_sm` pipeline once; later cells call `nlp(...)` to build Doc objects
nlp = spacy.load('en_core_web_sm')

In [4]:
# Run the sample sentence through the pipeline to obtain a Doc
# (plain str literal: the Python 2 style `u` prefix is redundant in Python 3)
doc = nlp('Tesla is looking at buying U.S. startup for $6 million')

In [5]:
# .text returns the exact string the Doc was created from
doc.text

'Tesla is looking at buying U.S. startup for $6 million'

In [6]:
# Iterating over a Doc yields its tokens; show each one's text with the
# numeric (.pos) and string (.pos_) form of its part-of-speech tag
for token in doc:
    print(token.text, token.pos, token.pos_)

Tesla 96 PROPN
is 87 AUX
looking 100 VERB
at 85 ADP
buying 100 VERB
U.S. 96 PROPN
startup 92 NOUN
for 85 ADP
$ 99 SYM
6 93 NUM
million 93 NUM


In [7]:
# Print each token separately
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

# .pos:  part-of-speech tag as an integer ID
# .pos_: part-of-speech tag as a string name
# .dep_: syntactic dependency label

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj


In [8]:
# The loaded pipeline as a list of (name, component) pairs
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x23ee1ee0348>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x23ee1ee0108>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x23ee1d58f98>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x23ee1d58c18>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x23ee1ed6b08>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x23ee1d75248>)]

In [9]:
# Only the component names, without the component objects
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


## Tokenization

In [10]:
# Deliberately awkward input: a contraction ("isn't") and a run of three spaces,
# to see how the tokenizer handles each
doc2 = nlp("Tesla isn't   looking into startups anymore.")

# One line per token: text, part-of-speech name, dependency label
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE nsubj
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [11]:
# The same sentence as a plain Python str, for comparison with spaCy's tokenization above
a = "Tesla isn't   looking into startups anymore."

In [12]:
# str.split() only breaks on whitespace: "isn't" and "anymore." stay whole,
# and the run of spaces leaves no trace (contrast with the spaCy tokens above)
a.split()

['Tesla', "isn't", 'looking', 'into', 'startups', 'anymore.']

In [13]:
# A bare Doc displays as its original text, extra spaces included
doc2

Tesla isn't   looking into startups anymore.

In [14]:
# Integer indexing on a Doc returns the token at that position (here, the first one)
doc2[0]

Tesla

In [15]:
# Confirm what kind of object nlp(...) returned
type(doc2)

spacy.tokens.doc.Doc

___
## Part-of-Speech Tagging (POS)
For a full list of POS Tags visit https://spacy.io/api/annotation#pos-tagging

In [16]:
# Part-of-speech tag (string form) of the first token, "Tesla"
doc2[0].pos_

'PROPN'


## Dependencies

For a full list of Syntactic Dependencies visit https://spacy.io/api/annotation#dependency-parsing


In [17]:
# Dependency label (string form) of the first token, "Tesla"
doc2[0].dep_

'nsubj'

To see the full name of a tag use `spacy.explain(tag)`

In [18]:
# Expand a part-of-speech abbreviation into a readable description
spacy.explain('PROPN')

'proper noun'

In [19]:
# The same lookup works for dependency labels
spacy.explain('nsubj')

'nominal subject'

___
## Additional Token Attributes


|Attribute|Description|Value for `doc2[0]`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist only of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [20]:
# Lemma (base form) of a token, printed beneath its surface text.
# "looking" sits at index 4 because the whitespace run in doc2 is itself token 3.
print(doc2[4].text, doc2[4].lemma_, sep='\n')

looking
look


In [21]:
# Simple part-of-speech tag first, then the detailed tag paired with its description
print(doc2[4].pos_)
print(' / '.join((doc2[4].tag_, spacy.explain(doc2[4].tag_))))

VERB
VBG / verb, gerund or present participle


In [22]:
# Word shapes: .shape_ encodes capitalization, punctuation and digits.
# The second example reads from `doc` (the first sentence), where token 5 is "U.S."
print(f"{doc2[0].text}: {doc2[0].shape_}")
print(f"{doc[5].text} : {doc[5].shape_}")

Tesla: Xxxxx
U.S. : X.X.


In [23]:
# Boolean token flags for "Tesla": alphabetic? stop word?
print(doc2[0].is_alpha, doc2[0].is_stop, sep='\n')

True
False



## Spans


In [24]:
# A longer passage used to demonstrate spans. The text is assembled from adjacent
# string literals, which Python concatenates into one str before nlp() sees it.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    "cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17."
)

In [25]:
# Slice by token position (not character offset): tokens 16 through 29 cover the
# quoted sentence, including its opening and closing quotation marks
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [26]:
# Slicing a Doc produces a Span rather than another Doc
type(life_quote)

spacy.tokens.span.Span

In [27]:
# The object that was sliced is still a Doc
type(doc3)

spacy.tokens.doc.Doc


## Sentences


In [28]:
# Three short sentences in one Doc, used below to demonstrate sentence segmentation
doc4 = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

In [29]:
# doc4.sents yields the detected sentences one at a time; each prints as its text
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [30]:
# Token 6 is the first word of the second sentence
doc4[6]

This

In [31]:
# True because this token opens a sentence
doc4[6].is_sent_start

True

In [32]:
# The next token, in the middle of the second sentence
doc4[7]

is

In [33]:
# False because this token does not begin a sentence
doc4[7].is_sent_start

False