In [1]:
import spacy
from spacy import displacy

In [2]:
# Build a blank English pipeline and run it over a sample sentence.
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# Show the tokenization result, one token per line.
print(*doc, sep="\n")

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [3]:
# List the component names of the current pipeline; for the blank pipeline
# created above this displays an empty list.
nlp.pipe_names

[]

In [4]:
# Create a blank (empty) English pipeline and inspect what it produces:
# each row shows the token, its part-of-speech tag and its lemma.

nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for tok in doc:
    columns = (tok, "  |   ", tok.pos_, "    |   ", tok.lemma_)
    print(*columns)

Captain   |         |    
america   |         |    
ate   |         |    
100   |         |    
$   |         |    
of   |         |    
samosa   |         |    
.   |         |    
Then   |         |    
he   |         |    
said   |         |    
I   |         |    
can   |         |    
do   |         |    
this   |         |    
all   |         |    
day   |         |    
.   |         |    


In [5]:
# Load the pre-trained "en_core_web_sm" pipeline and print the same
# token / part-of-speech / lemma table for the same sentence.

nlp = spacy.load("en_core_web_sm")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for tok in doc:
    pos_tag = tok.pos_
    lemma = tok.lemma_
    print(tok, "  |   ", pos_tag, "    |   ", lemma)

Captain   |    PROPN     |    Captain
america   |    PROPN     |    america
ate   |    VERB     |    eat
100   |    NUM     |    100
$   |    NUM     |    $
of   |    ADP     |    of
samosa   |    PROPN     |    samosa
.   |    PUNCT     |    .
Then   |    ADV     |    then
he   |    PRON     |    he
said   |    VERB     |    say
I   |    PRON     |    I
can   |    AUX     |    can
do   |    VERB     |    do
this   |    PRON     |    this
all   |    DET     |    all
day   |    NOUN     |    day
.   |    PUNCT     |    .


In [6]:
# Load the pre-trained English pipeline and dump its introspection
# attributes, each followed by a blank line.
nlp = spacy.load("en_core_web_sm")

# NOTE(review): "pipe" looks like a method rather than a data attribute, so
# the last iteration presumably prints its repr -- confirm this is intended.
for attr_name in ("pipe_names", "pipeline", "pipe_factories", "pipe_labels", "pipe"):
    print(getattr(nlp, attr_name), '\n')

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'] 

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x0000024ACF42BEF0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x0000024ACF42A870>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x0000024AD05CC430>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x0000024ACE8DC050>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x0000024AD0BC5C50>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x0000024AD05CC4A0>)] 

{'tok2vec': 'tok2vec', 'tagger': 'tagger', 'parser': 'parser', 'senter': 'senter', 'attribute_ruler': 'attribute_ruler', 'lemmatizer': 'lemmatizer', 'ner': 'ner'} 

{'tok2vec': [], 'tagger': ['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM'

# Named Entity Recognition


In [7]:
# Run the currently loaded pipeline over a sentence and list every named
# entity it reports together with the entity label.
doc = nlp("Tesla Inc has acquired twitter for $44 billion")

for entity in doc.ents:
    print(entity, "    |   ", entity.label_)

Tesla Inc     |    ORG
$44 billion     |    MONEY


In [8]:
# Visualize the named entities of the current doc with displaCy's entity
# style. `displacy` is imported with the other imports in the first cell so
# that every dependency of the notebook is declared in one place.
displacy.render(doc, style="ent")

**We can also use another pre-trained pipeline, in another language**


- Run `python -m spacy download fr_core_news_sm` to download the pre-trained pipeline package if you are executing this for the first time
- `fr_core_news_sm` is a pre-trained pipeline for the French language

In [11]:
# Switch to the French pipeline and print each detected entity with its
# label and spaCy's explanation of that label.
nlp = spacy.load("fr_core_news_sm")
doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")

for entity in doc.ents:
    label = entity.label_
    print(entity.text, " | ", label, " | ", spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [12]:
# One row per token of the current doc: text, part-of-speech tag, lemma.
for tok in doc:
    print(tok, " | ", tok.pos_, " | ", tok.lemma_)

Tesla  |  PROPN  |  Tesla
Inc  |  PROPN  |  Inc
va  |  VERB  |  aller
racheter  |  VERB  |  racheter
Twitter  |  VERB  |  twitter
pour  |  ADP  |  pour
$  |  NOUN  |  dollar
45  |  NUM  |  45
milliards  |  NOUN  |  milliard
de  |  ADP  |  de
dollars  |  NOUN  |  dollar


# Adding a Component to a blank pipeline

- Take a pre-trained pipeline and create a blank pipeline alongside it
- Copy the components from the pre-trained pipeline to the blank pipeline

In [14]:
# Source pipeline: the pre-trained English model whose component is reused.
nlp = spacy.load("en_core_web_sm")

# Target pipeline: starts blank, then receives the "ner" component copied
# from the source pipeline.
cust_nlp = spacy.blank("en")
cust_nlp.add_pipe("ner", source=nlp)

# Display the components now present in the custom pipeline.
cust_nlp.pipe_names

['ner']

In [15]:
# Process the sentence with the custom pipeline built above (blank English
# plus the copied "ner" component) rather than the full pre-trained `nlp`;
# otherwise this cell would not exercise the copied component at all.
doc = cust_nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Print the text and label of every entity found by the copied component.
for ent in doc.ents:
    print(ent.text, ent.label_)

Tesla Inc ORG
$45 billion MONEY
