⊕ [Linguistic Features · spaCy Usage Documentation](https://spacy.io/usage/linguistic-features#section-named-entities)


In [14]:
import spacy

# Load the `en_core_web_sm` pipeline and process one sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# One output line per entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [4]:
# Uses the `nlp` pipeline loaded in another cell.
# Longer variant of the input, kept for reference:
# doc = nlp("The whole city is a startup: Shenzhen is the Silicon Valley for hardware companies")
doc = nlp("Shenzhen is the Silicon Valley for hardware companies")

# One output line per entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Shenzhen 0 8 GPE
the Silicon Valley 12 30 LOC


In [15]:
# Uses the `nlp` pipeline loaded in another cell.
# Alternative input, kept for reference:
# doc = nlp("Rami Eid is studying at Stony Brook University in NY")
doc = nlp("Gates enrolled at Harvard University in the fall of 1973")

# One output line per entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Harvard University 18 36 ORG
the fall of 1973 40 56 DATE


## IOB SCHEME
- **I** – Token is inside an entity.
- **O** – Token is outside an entity.
- **B** – Token is the beginning of an entity.

In [2]:
import spacy

# Load the `en_core_web_sm` pipeline and process one sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')

# Document level: one (text, start_char, end_char, label) tuple per entity span.
ents = [
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
]
print(ents)

# Token level: [text, IOB tag, entity type] for the first two tokens.
ent_san, ent_francisco = (
    [token.text, token.ent_iob_, token.ent_type_]
    for token in (doc[0], doc[1])
)
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


## Built-in entity types
TIP: UNDERSTANDING ENTITY TYPES
You can also use spacy.explain() to get the description for the string representation of an entity label. For example, spacy.explain("LANGUAGE") will return "any named language".
Models trained on the OntoNotes 5 corpus support the following entity types:
+ https://spacy.io/usage/linguistic-features#entity-types

In [6]:
import spacy
from spacy.lang.de.examples import sentences

# Load the `de_core_news_sm` pipeline and run it on the first bundled example sentence.
nlp = spacy.load('de_core_news_sm')
doc = nlp(sentences[0])
print(doc.text)

# One output line per token: text, part-of-speech tag, dependency label.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen
Die DET nk
ganze ADJ nk
Stadt NOUN sb
ist AUX ROOT
ein DET nk
Startup NOUN pd
: PUNCT punct
Shenzhen PROPN pd
ist AUX ROOT
das DET nk
Silicon PROPN pnc
Valley PROPN sb
für ADP mnr
Hardware-Firmen NOUN nk


In [7]:
# Entities of the current `doc` (set in another cell): text, character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Shenzhen 33 41 LOC
Silicon Valley 50 64 LOC


In [10]:
import spacy
from spacy.lang.de.examples import sentences

# Load the `de_core_news_sm` pipeline and process a sentence given inline.
# NOTE(review): the `sentences` import above is not used by this cell.
nlp = spacy.load('de_core_news_sm')
doc = nlp(u'Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen')

# One output line per entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Shenzhen 33 41 LOC
Silicon Valley 50 64 LOC


In [12]:
# Uses the `nlp` pipeline loaded in another cell, on a shorter sentence
# without the leading "Die ganze Stadt ist ein Startup:" clause.
doc = nlp(u'Shenzhen ist das Silicon Valley für Hardware-Firmen')

# One output line per entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Silicon Valley 17 31 LOC


In [5]:
import spacy
from spacy.lang.fr.examples import sentences

# Load the `fr_core_news_sm` pipeline and run it on the first bundled example sentence.
nlp = spacy.load('fr_core_news_sm')
doc = nlp(sentences[0])
print(doc.text)

# One output line per token: text, part-of-speech tag, dependency label.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

Apple cherche a acheter une startup anglaise pour 1 milliard de dollard
Apple ADJ nsubj
cherche NOUN amod
a AUX aux
acheter VERB ROOT
une DET det
startup ADJ obj
anglaise NOUN amod
pour ADP case
1 NUM nummod
milliard NOUN obl
de ADP case
dollard NOUN nmod


In [6]:
# Entities of the current `doc` (set in another cell): text, character offsets, label.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

Apple 0 5 ORG
