In [12]:
import spacy

In [13]:
# Load the `en_core_web_sm` pipeline once; every later cell calls `nlp` on text.
nlp = spacy.load('en_core_web_sm')

In [14]:
# Inspect which components the default pipeline contains
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x21cd6feb9a0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x21cdb18bd10>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x21cdb16a760>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x21cdb1c4740>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x21cdb1f73c0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x21cdb16a940>)]

In [15]:
# Sample sentence that the following cells run through the pipeline.
text = 'Apple is looking for buying a UK startup for $1 billion'

## 解析文本，输出doc

In [16]:
# Run the pipeline on the sample text; the cells below read their results from `doc`.
doc = nlp(text)

## Tokenizer

In [17]:
# Show how the text was split: one token's text per output line.
for tok in doc:
    print(tok.text)

Apple
is
looking
for
buying
a
UK
startup
for
$
1
billion


## POS

In [18]:
# Print each token next to its part-of-speech tag (`pos_`). The token text is
# padded to 15 columns so the tags line up in the output.
for token in doc:
    # No trailing ':' after `token.pos_`: an empty format spec is a no-op, and
    # dropping it keeps this line in the same form as the lemma cell below.
    print(f'{token.text:{15}} {token.pos_}')

Apple           PROPN
is              AUX
looking         VERB
for             ADP
buying          VERB
a               DET
UK              PROPN
startup         NOUN
for             ADP
$               SYM
1               NUM
billion         NUM


## Dependency

In [19]:
from spacy import displacy

In [27]:
# Draw the dependency view of `doc` with displaCy's default settings.
displacy.render(doc, style='dep')

In [21]:
# Draw the dependency view again, this time passing explicit layout options.
dep_options = {'distance': 150, 'compact': True}
displacy.render(doc, style='dep', options=dep_options)

## NER

In [83]:
# List every entity span found in `doc` together with its label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label)

Apple ORG
UK GPE
$1 billion MONEY


In [84]:
# Draw the entity view of `doc`.
displacy.render(doc, style='ent')

## Lemmatizer

In [90]:
# Print each token next to its lemma (`lemma_`); the token text is padded to
# 15 columns so the lemmas line up in the output.
for token in doc:
    print(f'{token.text:{15}} {token.lemma_}')

Apple           Apple
is              be
looking         look
for             for
buying          buy
a               a
UK              UK
startup         startup
for             for
$               $
1               1
billion         billion


## Sentence segmentation

In [91]:
# Two-sentence input for the sentence-segmentation demo (replaces the earlier `text`).
text = 'Apple is looking for buying a UK startup. Government has given permission'

In [92]:
# Re-run the pipeline on the two-sentence text; this rebinds `doc`.
doc = nlp(text)

In [93]:
# Print each sentence span of `doc` on its own line.
for sentence in doc.sents:
    print(sentence)

Apple is looking for buying a UK startup.
Government has given permission


## spaCy pipeline技巧

In [1]:
# Sample texts for the batch-processing demo.
texts = [
    'net income was $9.4 million compared to the prior year of 2.7$ million',
    'revenue exceeds twelve billion dollars with a loss of $1b',
]

In [7]:
# Process all texts in one `nlp.pipe` call, with the components that this cell
# does not use passed in `disable`.
disabled_components = ['tagger', 'parser', 'lemmatizer']
docs = nlp.pipe(texts, disable=disabled_components)

# Print the entities of each text, followed by a blank line as a separator.
for doc in docs:
    for entity in doc.ents:
        print(entity.text, entity.label_)
    print()

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

