In [8]:
import sys

import spacy
from spacy import displacy
#https://www.youtube.com/watch?v=hKK59rfpXL0&list=PLeo1K3hjS3uuvuAXhYjV2lMEShq2UYSwX&index=9

In [9]:
# Start from a blank English pipeline; it is used below to tokenize a sample sentence.
nlp = spacy.blank("en")

In [10]:
# Tokenize a sample sentence with the blank pipeline and print each token's text on its own line.
sentence = "captain america ate 100$ of samosa. Then he said I can do this all day."
doc = nlp(sentence)
for token in doc:
    print(token.text)

captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [11]:
nlp.pipeline  # displays [] here: the blank pipeline has no components

[]

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 17.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
# Load the English-specific pipeline en_core_web_sm; this replaces the blank
# pipeline created earlier with spacy.blank("en").
nlp = spacy.load("en_core_web_sm")

In [18]:
# List the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [16]:
# Print every token together with its part of speech and its lemma.
#   token.pos_   -> part of speech (provided by the tagger component)
#   token.lemma_ -> base form of the word (provided by the lemmatizer component)
doc = nlp("captain america ate 100$ of samosa. Then he said says saw I can do this all day.")
for token in doc:
    print(f"{token.text} | {token.pos_} | {token.lemma_}")

captain | PROPN | captain
america | PROPN | america
ate | VERB | eat
100 | NUM | 100
$ | NUM | $
of | ADP | of
samosa | PROPN | samosa
. | PUNCT | .
Then | ADV | then
he | PRON | he
said | VERB | say
says | VERB | say
saw | VERB | see
I | PRON | I
can | AUX | can
do | VERB | do
this | PRON | this
all | DET | all
day | NOUN | day
. | PUNCT | .


In [50]:
# What does the ner component of the pipeline do?
# NER = named entity recognition: it lets you recognize entities in the text.
doc = nlp("Tesla Inc is founded by Elon Musk. Tesla is going to acquire Twitter for $45 billion")
#doc = nlp("Bloomberg founded company called Bloomberg")

# One line per entity: its text, its label, and spaCy's explanation of that label.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Elon Musk  |  PERSON  |  People, including fictional
Tesla  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


In [33]:
# Visual display of the entities in `doc`.
# displacy is imported in the notebook's import cell rather than here, so that all
# dependencies are declared in one place.
displacy.render(doc, style="ent")

In [34]:
#pipelines in different languages
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0-py3-none-any.whl (16.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.3/16.3 MB 25.5 MB/s eta 0:00:00
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.6.0
✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_sm')


In [36]:
# Load the French pipeline fr_core_news_sm.
nlp1 = spacy.load('fr_core_news_sm')

In [37]:
# Run the French pipeline on a French sentence and list each recognized entity
# with its label and spaCy's explanation of that label.
text = "Tesla Inc est fondée par Elon Musk. Tesla va acquérir Twitter pour 45 milliards de dollars."
doc1 = nlp1(text)
for ent in doc1.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Elon Musk  |  PER  |  Named person or family.
Tesla  |  PER  |  Named person or family.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [38]:
# Part of speech and lemma for every token of the French document.
for token in doc1:
    print(f"{token.text} | {token.pos_} | {token.lemma_}")

Tesla | PROPN | Tesla
Inc | PROPN | Inc
est | AUX | être
fondée | VERB | fonder
par | ADP | par
Elon | PROPN | Elon
Musk | PROPN | Musk
. | PUNCT | .
Tesla | PROPN | Tesla
va | VERB | aller
acquérir | VERB | acquérir
Twitter | NOUN | twitter
pour | ADP | pour
45 | NUM | 45
milliards | NOUN | milliard
de | ADP | de
dollars | NOUN | dollar
. | PUNCT | .


In [41]:
# Build nlp2: a blank French pipeline that receives only the "ner" component,
# sourced from the loaded fr_core_news_sm pipeline (nlp1).
nlp1 = spacy.load("fr_core_news_sm")
nlp2 = spacy.blank("fr")
nlp2.add_pipe("ner", source=nlp1)

<spacy.pipeline.ner.EntityRecognizer at 0x29d0bef10>

In [42]:
# Check which components nlp2 contains; only the sourced "ner" is listed.
nlp2.pipe_names

['ner']

In [48]:
# Run the NER-only pipeline nlp2 on a French sentence and list each recognized
# entity with its label and spaCy's explanation of that label.
text = "Tesla Inc est fondée par Elon Musk. Tesla va acquérir Twitter pour $45 milliards"
doc1 = nlp2(text)
for ent in doc1.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Elon Musk  |  PER  |  Named person or family.
Tesla  |  PER  |  Named person or family.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [49]:
# Contrast case: a blank French pipeline has no ner component, so no entities
# are expected from it.
nlp3 = spacy.blank("fr")
text = "Tesla Inc est fondée par Elon Musk. Tesla va acquérir Twitter pour 45 milliards de dollars."
doc1 = nlp3(text)
for ent in doc1.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")