In [1]:
!pip install spacy
!pip install nltk
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Implementation of Named Entity Recognition in Python

### Analyzing the Text
The text is first preprocessed to locate words and phrases that could represent entities.

### Finding Sentence Boundaries
This step identifies where sentences start and end using punctuation and capitalization, which helps preserve the meaning of the context.

### Tokenizing and Part of Speech Tagging
Text is broken into tokens (words) and each token is tagged with its grammatical role, which provides important clues for identifying entities.

### Entity detection and classification
Tokens or groups of tokens that match patterns of known entities are recognized and classified into predefined categories such as person, organization, location, etc.

### Model training and refinement
Machine Learning models are trained using labeled datasets and they improve over time by learning patterns and relationships between words.

### Adapting to new contexts
A well-trained model can generalize to different languages, styles and unseen types of entities by learning from context.




In [1]:
# Step 2: importing libraries and loading the spaCy English model
import pandas as pd
import spacy
import requests
from bs4 import BeautifulSoup
# Let pandas show up to 200 rows when it renders a DataFrame.
pd.options.display.max_rows = 200

# Load the "en_core_web_sm" spaCy pipeline; the resulting `nlp` object is
# what later cells call to process text.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Step 3: applying NER to sample text
#
# The passage below is just sample content; any text of your choice works.
# nlp(content) runs the loaded pipeline over the text held in `content` and
# returns the processed document, which is kept in `doc` for further analysis.
# Iterating over doc.ents visits every named entity found in that document;
# each one is printed with its text, start/end character offsets and label.
content = (
    "Trinamool Congress leader Mahua Moitra has moved the Supreme Court "
    "against her expulsion from the Lok Sabha over the cash-for-query "
    "allegations against her. Moitra was ousted from the Parliament last "
    "week after the Ethics Committee of the Lok Sabha found her guilty of "
    "jeopardising national security by sharing her parliamentary portal's "
    "login credentials with businessman Darshan Hiranandani."
)

doc = nlp(content)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


Trinamool Congress 0 18 ORG
Mahua Moitra 26 38 PERSON
the Supreme Court 49 66 ORG
Moitra 157 163 NORP
Parliament 184 194 ORG
last week 195 204 DATE
the Ethics Committee 211 231 ORG
Darshan Hiranandani 373 392 PERSON


In [3]:
# Step 4: visualising the entities
# Render the processed document with displacy's "ent" style, which displays
# the recognised entity spans of `doc`.
from spacy import displacy

displacy.render(docs=doc, style="ent")

In [4]:
# Step 5: creating a DataFrame for the entities
# One (text, label, lemma) tuple per entity in doc.ents.
entities = [
    (span.text, span.label_, span.lemma_)
    for span in doc.ents
]

# Tabulate the tuples under explicit column names and print the table.
df = pd.DataFrame(data=entities, columns=["text", "type", "lemma"])
print(df)

                   text    type                 lemma
0    Trinamool Congress     ORG    Trinamool Congress
1          Mahua Moitra  PERSON          Mahua Moitra
2     the Supreme Court     ORG     the Supreme Court
3                Moitra    NORP                Moitra
4            Parliament     ORG            Parliament
5             last week    DATE             last week
6  the Ethics Committee     ORG  the Ethics Committee
7   Darshan Hiranandani  PERSON   Darshan Hiranandani
