In [3]:
#we can create a Doc object using its constructor explicitly
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
doc = Doc(Vocab(), words=[u'Hi', u'there'])
print(doc)

Hi there 


In [11]:
#Iterating over a Token’s Syntactic Children
import spacy
nlp = spacy.load('en')
doc = nlp(u'I want a green apple.')
[w for w in doc[4].children]


[a, green]

In [13]:
#Separate a text into its individual sentences
doc = nlp(u'A severe storm hit the beach. It started to rain.')
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[A, severe, storm, hit, the, beach, .]
[It, started, to, rain, .]


In [14]:
for i,sent in enumerate(doc.sents):
    if i==1 and sent[0].pos_== 'PRON':
        print('The second sentence begins with a pronoun.')

The second sentence begins with a pronoun.


In [17]:
counter = 0
for sent in doc.sents:
    if sent[len(sent)-2].pos_ == 'VERB':
        counter+=1
print(counter)

1


In [18]:
#The doc.noun_chunks Container
doc = nlp(u'A noun chunk is a phrase that has a noun as its head.')
for chunk in doc.noun_chunks:
    print(chunk)

A noun chunk
a phrase
a noun
its head


In [33]:
#Alternative
for token in doc:
    if token.pos_=='NOUN':
        chunk = ''
        for w in token.lefts:
            #if w.pos_ == 'DET' or w.pos_ == 'ADJ':
            chunk += w.text + ' '
        chunk += token.text
        print(chunk)

A noun chunk
a phrase
a noun
its head


In [34]:
#The Span Object
doc=nlp('I want a green apple.')
doc[2:5]

a green apple

In [42]:
doc = nlp(u'The Golden Gate Bridge is an iconic landmark in San Francisco.')
[doc[i] for i in range(len(doc))]

[The, Golden, Gate, Bridge, is, an, iconic, landmark, in, San, Francisco, .]

In [43]:
#Merge
span = doc[1:4]
lem_id = doc.vocab.strings[span.text]
span.merge(lemma = lem_id)

  span.merge(lemma = lem_id)
  span.merge(lemma = lem_id)


Golden Gate Bridge

In [46]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)



The the DET det
Golden Gate Bridge Golden Gate Bridge PROPN nsubj
is be AUX ROOT
an an DET det
iconic iconic ADJ amod
landmark landmark NOUN attr
in in ADP prep
San Francisco San Francisco PROPN pobj
. . PUNCT punct


In [45]:
#Merge SF
span = doc[7:9]
lem_id = doc.vocab.strings[span.text]
span.merge(lemma = lem_id)

  span.merge(lemma = lem_id)
  span.merge(lemma = lem_id)


San Francisco

In [47]:
#CUSTOMIZING THE TEXT-PROCESSING PIPELINE
nlp.pipe_names

['tagger', 'parser', 'ner']

In [48]:
#Disabling Pipeline Components
nlp = spacy.load('en', disable=['parser'])

In [49]:
doc = nlp(u'I want a green apple.')
for token in doc:
    print(token.text, token.pos_, token.dep_)

I PRON 
want VERB 
a DET 
green ADJ 
apple NOUN 
. PUNCT 


In [84]:
nlp = spacy.load('en')
print(nlp.meta['lang'] + '_' + nlp.meta['name'])

en_core_web_sm


In [85]:
from spacy import util
util.get_package_path('en_core_web_sm')

PosixPath('/home/user/.local/lib/python3.8/site-packages/en_core_web_sm')

In [86]:
print(nlp.meta['lang'] + '_' + nlp.meta['name'] + '-' + nlp.meta['version'])

en_core_web_sm-2.3.1


In [92]:
nlp.meta['pipeline']

['tagger', 'parser', 'ner']

In [91]:
#Customizing the Pipeline Components
doc = nlp(u'I need a taxi to Festy.')
for ent in doc.ents:
    print(ent.text, ent.label_)

Festy ORG


In [64]:
#For simplicity, this training set contains just two training samples
LABEL = 'DISTRICT'
TRAIN_DATA = [
('We need to deliver it to Festy.', {
    'entities': [(25, 30, 'DISTRICT')]
  }),
('I like red oranges', {
'entities': []
  })
]

In [68]:
ner = nlp.get_pipe('ner')
ner.add_label(LABEL)
nlp.disable_pipes('tagger')
nlp.disable_pipes('parser')

ValueError: [E001] No component 'tagger' found in pipeline. Available names: ['ner']

In [69]:
#start traning
optimizer = nlp.entity.create_optimizer()
import random

for i in range(25):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)

In [70]:
doc = nlp(u'I need a taxi to Festy.')
for ent in doc.ents:
    print(ent.text, ent.label_) 

Festy DISTRICT


In [77]:
#write the trained model to disk
ner.to_disk('ner/')

In [89]:
#read the trained model from disk
ner.from_disk('ner/')

<spacy.pipeline.pipes.EntityRecognizer at 0x7fc727e5d3a0>

In [93]:
#Test
import spacy
from spacy.pipeline import EntityRecognizer
nlp = spacy.load('en', disable=['ner'])
ner = EntityRecognizer(nlp.vocab)
ner.from_disk('ner/')
nlp.add_pipe(ner)
doc = nlp(u'We need to deliver it to Festy.')
for ent in doc.ents:
    print(ent.text, ent.label_)

Festy DISTRICT


In [94]:
#the end of the chapter describes Cython and C ++ functions to speed up your work
#spacytext.px->setup.py->build and enjoy!)

In [95]:
from spacytext import main

In [106]:
import spacy
nlp = spacy.load('en')
f = open("test.txt","rb")
contents =f.read()
doc = nlp(contents[:1000000].decode('utf8'))
main(doc)

196229
1017


In [105]:
#Profit!