In [1]:
import sys

import spacy
from spacy import displacy

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 320.0 kB/s eta 0:00:40
     --------------------------------------- 0.1/12.8 MB 518.5 kB/s eta 0:00:25
     - -------------------------------------- 0.3/12.8 MB 2.4 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 3.1 MB/s eta 0:00:04
     -- ------------------------------------- 0.9/12.8 MB 3.9 MB/s eta 0:00:04
     --- ------------------------------------ 1.2/12.8 MB 4.4 MB/s eta 0:00:03
     ---- ----------------------------------- 1.3/12.8 MB 4.2 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 4.1 MB/s eta 0:00:03
     ----- ---------------------------------- 1.6/12.8 MB 4.2 MB/s eta 0:00:03
     ----- ----------------------------

# Load the English pipeline:

In [4]:
# Name of the spaCy pipeline package to load.
MODEL_NAME = "en_core_web_sm"

# Pipeline object that the later cells call on their example texts.
nlp = spacy.load(MODEL_NAME)

# Define the sentence:

In [7]:
# Keep the raw sentence under its own name so `sent` only ever refers to the
# parsed document (it is never a plain string).
sent_text = 'Mark Zukerber will meet Aditya Joshi on Monday 6th June 2024, 10am for $3 Trillion, Mumbai. '

# Run the pipeline on the sentence; later cells read entities from `sent`.
sent = nlp(sent_text)

# Find the named entities:


In [10]:
# Entity spans found in `sent`; as the cell's last expression it is displayed as a tuple.
sent.ents

(Mark Zukerber, Aditya Joshi, Monday 6th June 2024, 10am, $3 Trillion, Mumbai)

In [12]:
# Print each entity's text next to the label assigned to it.
for span in sent.ents:
    text, label = span.text, span.label_
    print(text, "==> ", label)

Mark Zukerber ==>  PERSON
Aditya Joshi ==>  PERSON
Monday 6th June 2024 ==>  DATE
10am ==>  TIME
$3 Trillion ==>  MONEY
Mumbai ==>  GPE


# More examples

In [21]:

# Source paragraph, kept under its own name so `raw_text` only ever refers to
# the parsed document (a later cell passes `raw_text` to displacy).
alaska_text = 'Alaska is the largest U.S. state by area, comprising more total area than the next three largest states of Texas, California and Montana combined, and is the seventh-largest subnational division in the world. It is the third-least populous and most sparsely populated U.S. state, but is, with a population of 736,081 as of 2020, the continents most populous territory located mostly north of the 60th parallel, with more than quadruple the combined populations of Northern Canada and Greenland'

# Run the pipeline on the paragraph.
raw_text = nlp(alaska_text)

# Print every entity with its label. (A bare `raw_text.ents` in the middle of
# the cell displayed nothing, so it was dropped.)
for ent in raw_text.ents:
    print(ent.text, '==>', ent.label_)

Alaska ==> GPE
U.S. ==> GPE
three ==> CARDINAL
Texas ==> GPE
California ==> GPE
Montana ==> GPE
seventh ==> ORDINAL
third ==> ORDINAL
U.S. ==> GPE
736,081 ==> CARDINAL
2020 ==> DATE
60th ==> ORDINAL
Northern Canada ==> ORG
Greenland ==> GPE


# Look up what the NER labels mean:

In [24]:
# Print spaCy's description of each label, one per line.
for label in ("GPE", "CARDINAL", "ORDINAL", "NORP"):
    print(spacy.explain(label))

Countries, cities, states
Numerals that do not fall under another type
"first", "second", etc.
Nationalities or religious or political groups


# Display the named entities visually:

In [25]:
# Render the entities of the first example sentence inline in the notebook.
# `displacy` is imported in the notebook's top import cell.
displacy.render(sent, style="ent", jupyter=True)

In [28]:
# Render the entities of the Alaska paragraph inline in the notebook.
displacy.render(raw_text, style="ent", jupyter=True)

# Example:

In [33]:

# Source paragraph, kept under its own name so `icc_text` only ever refers to
# the parsed document that the following cells filter and render.
icc_source = 'The Chairman heads the board of directors and on June 26, 2014, Narayanaswami Srinivasan, the former president of BCCI, was announced as the first chairman of the council.[6] The role of ICC president became a largely honorary position after the establishment of the chairman role and other changes made to the ICC constitution in 2014. It has been claimed that the 2014 changes have handed control to the Big Three nations of England, India and Australia. The last ICC president was Zaheer Abbas, who was appointed in June 2015 following the resignation of Mustafa Kamal in April 2015. When the post of ICC president was abolished in April 2016, Shashank Manohar, who replaced Srinivasan in October 2015, became the first independent elected chairman of the ICC.'

# Run the pipeline on the paragraph.
icc_text = nlp(icc_source)

# Display all entity spans found in the paragraph.
icc_text.ents



Narayanaswami Srinivasan ==> PERSON
Zaheer Abbas ==> PERSON
Mustafa Kamal ==> PERSON
Shashank Manohar ==> PERSON


# Show only the PERSON entities:

In [34]:
# Collect the spans labelled PERSON, then print them in the order they occur
# in `icc_text.ents`.
people = [span for span in icc_text.ents if span.label_ == "PERSON"]
for person in people:
    print(person.text, '==>', person.label_)

Narayanaswami Srinivasan ==> PERSON
Zaheer Abbas ==> PERSON
Mustafa Kamal ==> PERSON
Shashank Manohar ==> PERSON


# Show only the location (GPE) entities:

In [38]:
# Walk all entities and skip everything that is not labelled GPE.
for span in icc_text.ents:
    if span.label_ != "GPE":
        continue
    print(span.text, '==>', span.label_)

England ==> GPE
India ==> GPE
Australia ==> GPE


In [40]:
# Render the entities of the ICC paragraph inline in the notebook.
displacy.render(icc_text, style="ent", jupyter=True)