# spaCy language processing pipeline

In [1]:
import spacy
from spacy import displacy

2023-08-27 04:42:54.827557: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# A blank English pipeline has no trained components, so calling it on
# text only runs the tokenizer.
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# Show the text of each token on its own line.
print("\n".join(token.text for token in doc))

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [3]:
# List the names of the components in the current pipeline.
nlp.pipe_names

[]

# Download a trained pipeline

Install the small English pipeline by running this command in a terminal: `python -m spacy download en_core_web_sm`

In [4]:
# Replace the blank pipeline with the trained small English pipeline,
# then display the names of the components it contains.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Show the pipeline as a list of (name, component object) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fa0981d6ce0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fa0981d7700>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fa0c9666ea0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fa0a8215200>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fa0a82158c0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fa0c9666f80>)]

In [6]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# For every token, show its text, a readable description of its
# part-of-speech tag, and its lemma.
for token in doc:
    pos_description = spacy.explain(token.pos_)
    print(f"{token}  |  {pos_description}  |  {token.lemma_}")

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
samosa  |  proper noun  |  samosa
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


# Named Entity Recognition


In [7]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Print each recognised entity span followed by its label.
for ent in doc.ents:
    print(f"{ent} {ent.label_}")

Tesla Inc ORG
$45 billion MONEY


In [8]:
# Render the entities of `doc` with displaCy's entity visualizer.
# `displacy` is imported in the notebook's first (imports) cell so that all
# dependencies are declared in one place.
displacy.render(doc, style="ent")

# Trained processing pipeline in French


In [9]:
# Load the small French pipeline; from here on `nlp` refers to it.
# NOTE: presumably this pipeline must be installed first, in the same way as
# the English one (python -m spacy download fr_core_news_sm) -- confirm.
nlp = spacy.load("fr_core_news_sm")


In [12]:
doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")

# Print each entity with its label and a readable explanation of that label.
for ent in doc.ents:
    label = ent.label_
    print(f"{ent}  |  {label}  |  {spacy.explain(label)}")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [14]:
# `doc` is the document built in the preceding cell; show the
# part-of-speech tag and lemma of each of its tokens.
for token in doc:
    print(f"{token}  |  {token.pos_}  |  {token.lemma_}")

Tesla  |  PROPN  |  Tesla
Inc  |  PROPN  |  Inc
va  |  VERB  |  aller
racheter  |  VERB  |  racheter
Twitter  |  VERB  |  twitter
pour  |  ADP  |  pour
$  |  NOUN  |  dollar
45  |  NUM  |  45
milliards  |  NOUN  |  milliard
de  |  ADP  |  de
dollars  |  NOUN  |  dollar


# Adding a component to a blank pipeline


In [16]:
# Use the trained English pipeline purely as a donor: start from a blank
# English pipeline and copy only the donor's "ner" component into it.
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)

# Display the component names of the newly assembled pipeline.
nlp.pipe_names


['ner']

In [17]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Print the text and label of each entity found by the sourced component.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


Tesla Inc ORG
$45 billion MONEY


https://raw.githubusercontent.com/codebasics/nlp-tutorials/800619f7ee7dafa941c09b9395903c8995df12e7/5_spacy_lang_processing_pipeline/sentecizer.jpg