<a href="https://colab.research.google.com/github/siddhesh1503/NLP/blob/main/NLP_EXP_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NER using NLTK (English Language)

In [None]:
# Install and import NLTK
!pip install -q nltk
import nltk
import re
from nltk import word_tokenize, pos_tag, ne_chunk, Tree
from nltk.corpus import names

# Download required NLTK datasets.
# NLTK 3.9+ loads the tokenizer, tagger and chunker from the 'punkt_tab',
# 'averaged_perceptron_tagger_eng' and 'maxent_ne_chunker_tab' packages,
# while older releases use the original pickled packages. Both generations
# are requested so the cell works on either.
datasets = [
    'punkt', 'punkt_tab',
    'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker', 'maxent_ne_chunker_tab',
    'words', 'names',
]
for dataset in datasets:
    nltk.download(dataset)

# Load first names from NLTK, lower-cased so lookups can be case-insensitive
first_names = {name.lower() for name in names.words()}

# Extract standard NLTK named entities
def extract_named_entities(tree):
    """Collect the entity chunks found directly under ``tree``.

    Args:
        tree: Iterable of chunked items (e.g. the result of ``ne_chunk``).
            Only items that are ``Tree`` instances are treated as entities;
            everything else is skipped.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in iteration order.
        The text is the chunk's leaf tokens joined by single spaces and the
        label is the chunk's ``label()``.
    """
    return [
        (" ".join(word for word, _tag in subtree.leaves()), subtree.label())
        for subtree in tree
        if isinstance(subtree, Tree)
    ]

# Extract additional entities like TIME, DATE, PERCENTAGE, EVENT, GAME
def extract_additional_entities(text):
    """Find TIME, DATE, PERCENTAGE, EVENT and GAME mentions in ``text``.

    Args:
        text: Raw input string to scan.

    Returns:
        A list of ``(entity_text, label)`` tuples grouped by label in the
        order TIME, DATE, PERCENTAGE, EVENT, GAME. Regex-based labels carry
        the matched substring. EVENT and GAME carry the lower-case keyword,
        reported once per keyword no matter how often it occurs.
    """
    additional_entities = []

    # Time pattern: "HH:MM" (24-hour range) with an optional AM/PM suffix,
    # or a bare hour 1-12 followed by AM/PM.
    time_pattern = r'\b(?:[01]?\d|2[0-3]):[0-5]\d(?:\s?[APap][Mm])?\b|\b(?:[1-9]|1[0-2])\s?[APap][Mm]\b'
    additional_entities.extend(
        (t, 'TIME') for t in re.findall(time_pattern, text)
    )

    # Date pattern: ISO "YYYY-MM-DD", or "<Month> <day>" with an optional
    # ", <year>" (month spelled out or abbreviated to three letters).
    date_pattern = r'\b(?:\d{4}-\d{2}-\d{2})\b|\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|' \
                   r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s*\d{4})?\b'
    additional_entities.extend(
        (d, 'DATE') for d in re.findall(date_pattern, text)
    )

    # Percentage pattern. There is deliberately no trailing ``\b``: '%' is
    # not a word character, so ``%\b`` only matches when a letter or digit
    # follows immediately, and ordinary text such as "60% chance" or "60%."
    # would never be found.
    percentage_pattern = r'\b\d+(?:\.\d+)?%'
    additional_entities.extend(
        (p, 'PERCENTAGE') for p in re.findall(percentage_pattern, text)
    )

    # Keyword lookups: whole-word, case-insensitive.
    keyword_groups = (
        ('EVENT', ['concert', 'festival', 'conference', 'meeting', 'wedding']),
        ('GAME', ['cricket', 'football', 'chess', 'tennis', 'hockey']),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE):
                additional_entities.append((keyword, label))

    return additional_entities

# Correct misclassified PERSON entities
def correct_person_entities(entities):
    """Relabel GPE entities whose text is a known first name as PERSON.

    Args:
        entities: Iterable of ``(name, label)`` tuples.

    Returns:
        A new list with the same tuples in the same order. An entry labelled
        ``'GPE'`` whose lower-cased name is in ``first_names`` gets the label
        ``'PERSON'``; every other entry is passed through unchanged.
    """
    return [
        (name, 'PERSON')
        if label == 'GPE' and name.lower() in first_names
        else (name, label)
        for name, label in entities
    ]

# Main NER function
def perform_ner(text):
    """Run the full NLTK-based entity extraction on ``text``.

    The text is tokenized, POS-tagged and chunked with NLTK. The chunker's
    entities are followed by the regex/keyword entities, and the combined
    list is passed through ``correct_person_entities``.

    Args:
        text: Raw English input string.

    Returns:
        A list of ``(entity_text, label)`` tuples.
    """
    chunk_tree = ne_chunk(pos_tag(word_tokenize(text)))
    combined = extract_named_entities(chunk_tree) + extract_additional_entities(text)
    return correct_person_entities(combined)

# Execution
if __name__ == "__main__":
    # Read one line of text, extract its entities and print one per line.
    text_en = input("Enter English text: ")
    entities = perform_ner(text_en)

    print("\n🔹 Optimized NLTK NER Results:")
    if not entities:
        print("No entities found.")
    else:
        for ent_text, ent_label in entities:
            print(f"Entity: {ent_text}, Type: {ent_label}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


Enter English text: Alice will travel to Paris for a music festival on September 15 at 7 PM and will play chess there with 60% chance of rain.

🔹 Optimized NLTK NER Results:
Entity: Alice, Type: PERSON
Entity: Paris, Type: GPE
Entity: 7 PM, Type: TIME
Entity: September 15, Type: DATE
Entity: festival, Type: EVENT
Entity: chess, Type: GAME


NER using spaCy (English Language)

In [None]:
import spacy
import re

# Download the small English model if not already present
!python -m spacy download en_core_web_sm

# Load the small English model by package name. The resulting module-level
# `nlp` object is what perform_ner() below calls on the raw input text.
nlp = spacy.load("en_core_web_sm")

# Function to extract additional entities like events and games using regex
def extract_additional_entities(text):
    """Report which event and game keywords are mentioned in ``text``.

    Matching is whole-word and case-insensitive.

    Args:
        text: Raw input string to scan.

    Returns:
        A list of ``(keyword, label)`` tuples: all ``'EVENT'`` hits first,
        then all ``'GAME'`` hits, each in keyword-list order. The keyword is
        reported as spelled in the list, once per keyword.
    """
    event_keywords = ('concert', 'festival', 'conference', 'meeting', 'wedding')
    game_keywords = ('cricket', 'football', 'chess', 'tennis', 'hockey')

    def mentioned(keyword):
        # True when the keyword occurs as a whole word, ignoring case.
        return re.search(rf'\b{re.escape(keyword)}\b', text, re.IGNORECASE) is not None

    found = [(event, 'EVENT') for event in event_keywords if mentioned(event)]
    found += [(game, 'GAME') for game in game_keywords if mentioned(game)]
    return found

# Main NER function using spaCy
def perform_ner(text):
    """Extract entities from ``text`` with spaCy plus the keyword lookups.

    Args:
        text: Raw English input string.

    Returns:
        A list of ``(entity_text, label)`` tuples: the entities from the
        module-level ``nlp`` pipeline first, followed by the EVENT/GAME
        keyword hits from ``extract_additional_entities``.
    """
    model_entities = [(span.text, span.label_) for span in nlp(text).ents]
    return model_entities + extract_additional_entities(text)

# Main execution block
if __name__ == "__main__":
    # Read one line of text, extract its entities and print one per line.
    text_en = input("Enter English text: ")
    entities = perform_ner(text_en)

    print("\n🔹 spaCy NER Results:")
    if not entities:
        print("No entities found.")
    else:
        for ent_text, ent_label in entities:
            print(f"Entity: {ent_text}, Type: {ent_label}")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 42.6 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Enter English text: John will attend a cricket match in London on August 20 at 5 PM.

🔹 spaCy NER Results:
Entity: John, Type: PERSON
Entity: London, Type: GPE
Entity: August 20, Type: DATE
Entity: 5 PM, Type: TIME
Entity: cricket, Type: GAME


NER using Stanza (English Language)

In [None]:
# Install stanza
!pip install -q stanza

import stanza
import re
import logging

# Raise the 'stanza' logger's threshold to ERROR so that lower-severity
# records are not emitted.
logging.getLogger('stanza').setLevel(logging.ERROR)

# Download the English model package before building the pipeline.
stanza.download('en')

# Initialize the pipeline with only the 'tokenize' and 'ner' processors.
# perform_ner() below calls this module-level `nlp` on the input text.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Keywords reported as EVENT / GAME entities when they occur in the text
EVENTS_LIST = [
    'concert',
    'festival',
    'conference',
    'meeting',
    'wedding',
]
GAMES_LIST = [
    'cricket',
    'football',
    'chess',
    'tennis',
    'hockey',
]


def extract_additional_entities(text):
    """Report which EVENTS_LIST / GAMES_LIST keywords occur in ``text``.

    Matching is whole-word and case-insensitive.

    Args:
        text: Raw input string to scan.

    Returns:
        A list of ``(keyword, label)`` tuples: ``'EVENT'`` hits in
        ``EVENTS_LIST`` order followed by ``'GAME'`` hits in ``GAMES_LIST``
        order. Each keyword is reported at most once, as spelled in its list.
    """
    matches = []
    for label, keywords in (('EVENT', EVENTS_LIST), ('GAME', GAMES_LIST)):
        for keyword in keywords:
            pattern = r'\b' + re.escape(keyword) + r'\b'
            if re.search(pattern, text, flags=re.IGNORECASE):
                matches.append((keyword, label))
    return matches

# Main NER function
def perform_ner(text):
    """Extract entities from ``text`` with Stanza plus the keyword lookups.

    Args:
        text: Raw English input string.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples: the entities of
        every sentence produced by the module-level ``nlp`` pipeline, in
        document order, followed by the EVENT/GAME keyword hits from
        ``extract_additional_entities``.
    """
    doc = nlp(text)
    model_entities = [
        (ent.text, ent.type)
        for sent in doc.sentences
        for ent in sent.ents
    ]
    return model_entities + extract_additional_entities(text)

# Execution
if __name__ == "__main__":
    # Read one line of text, extract its entities and print one per line.
    text_en = input("Enter English text: ")
    entities = perform_ner(text_en)

    print("\n🔹 Stanza NER Results:")
    if not entities:
        print("No entities found.")
    else:
        for ent_text, ent_label in entities:
            print(f"Entity: {ent_text}, Type: {ent_label}")


Enter English text: Michael will attend the football championship in New York on October 10 at 3 PM and participate in a chess tournament.

🔹 Stanza NER Results:
Entity: Michael, Type: PERSON
Entity: New York, Type: GPE
Entity: October 10, Type: DATE
Entity: 3 PM, Type: TIME
Entity: football, Type: GAME
Entity: chess, Type: GAME


NER using Stanza (Regional Languages: Hindi and Marathi)

In [None]:
import stanza
import re
import logging

logging.getLogger('stanza').setLevel(logging.ERROR)  # Only show errors

# -------------------- Initialize Pipelines --------------------
# One pipeline per language code ('hi' and 'mr'), each configured with the
# tokenize, pos and ner processors. They are passed to process_sentence()
# together with the matching language code.
# NOTE(review): no stanza.download('hi') / stanza.download('mr') call is
# visible in this cell; presumably the models are already cached or are
# fetched by Pipeline itself -- confirm.
nlp_hi = stanza.Pipeline(lang='hi', processors='tokenize,pos,ner', verbose=False)
nlp_mr = stanza.Pipeline(lang='mr', processors='tokenize,pos,ner', verbose=False)

# -------------------- Event/Game Keywords --------------------
# Hindi ("_HI") and Marathi ("_MR") keyword tables; a keyword found in the
# input text is reported as an EVENT or GAME entity.
EVENTS_LIST_HI = [
    'समारोह',
    'महोत्सव',
    'कॉन्फ्रेंस',
    'मीटिंग',
    'विवाह',
    'क्रिकेट टूर्नामेंट',
    'संगीत महोत्सव',
]
GAMES_LIST_HI = [
    'क्रिकेट',
    'फुटबॉल',
    'शतरंज',
    'टेनिस',
    'हॉकी',
]
EVENTS_LIST_MR = [
    'समारंभ',
    'उत्सव',
    'सम्मेलन',
    'मीटिंग',
    'लग्न',
    'संगीत उत्सव',
    'क्रिकेट स्पर्धा',
]
GAMES_LIST_MR = [
    'क्रिकेट',
    'फुटबॉल',
    'शतरंज',
    'टेनिस',
    'हॉकी',
]

# -------------------- Names Keywords --------------------
# Names reported as PERSON entities when they occur in the input text.
NAMES_LIST_HI = [
    'राहुल',
    'सारा',
    'अमित',
    'पुनीत',
]
NAMES_LIST_MR = [
    'सारा',
    'रोहित',
    'अभिजीत',
    'नेहा',
]

# -------------------- Regex Patterns --------------------
# `\b` is unreliable for Devanagari text. Python's `\w` does not match
# dependent vowel signs (matras) or the virama, so a phrase ending in a matra
# (e.g. "बजे", "वाजता", "टक्के", "मे") followed by a space has no `\b` after
# it and the pattern silently fails. A number word preceded by a matra inside
# a longer word would also wrongly count as a word start.
# Explicit lookarounds are used instead: a match may not be directly preceded
# or followed by a Devanagari letter/sign/digit or any `\w` character. The
# danda characters (U+0964, U+0965) are kept out of the class so a match
# right before a sentence-final "।" is still accepted.
_DEVANAGARI_WORD_CHAR = r'[\u0900-\u0963\u0966-\u097F\w]'
_WORD_START = r'(?<!' + _DEVANAGARI_WORD_CHAR + r')'
_WORD_END = r'(?!' + _DEVANAGARI_WORD_CHAR + r')'

# "<hour word> बजे" (Hindi) or "<hour word> वाजता" (Marathi). The Marathi
# word 'सहा' is kept in the Hindi alternative because the original pattern
# accepted it there.
TIME_PATTERN = (
    _WORD_START
    + r'(?:एक|दो|तीन|चार|पाँच|छह|सात|आठ|नौ|दस|ग्यारह|बारह|सहा)\s?बजे'
    + _WORD_END
    + r'|'
    + _WORD_START
    + r'(?:एक|दोन|तीन|चार|पाच|सहा|सात|आठ|नऊ|दहा|अकरा|बारा)\s?वाजता'
    + _WORD_END
)

# "<day word 1-30> <month name>" in Hindi.
DATE_PATTERN_HI = (
    _WORD_START
    + r'(?:एक|दो|तीन|चार|पाँच|छह|सात|आठ|नौ|दस|ग्यारह|बारह|तेरह|चौदह|पंद्रह|सोलह|सत्रह|अठारह|उन्नीस|बीस|इक्कीस|बाईस|तेईस|चौबीस|पच्चीस|छब्बीस|सत्ताईस|अट्ठाईस|उनतीस|तीस)'
    + r'\s?(?:जनवरी|फरवरी|मार्च|अप्रैल|मई|जून|जुलाई|अगस्त|सितंबर|अक्टूबर|नवंबर|दिसंबर)'
    + _WORD_END
)

# "<day word 1-30> <month name>" in Marathi.
DATE_PATTERN_MR = (
    _WORD_START
    + r'(?:एक|दोन|तीन|चार|पाच|सहा|सात|आठ|नऊ|दहा|अकरा|बारा|तेरा|चौदा|पंधरा|सोला|सतर|अठरा|एकोणीस|वीस|एकवीस|बावीस|तेवीस|चोवीस|पंचवीस|सव्वीस|सत्तावीस|अठ्ठावीस|एकोणतीस|तीस)'
    + r'\s?(?:जानेवारी|फेब्रुवारी|मार्च|एप्रिल|मे|जून|जुलै|ऑगस्ट|सप्टेंबर|ऑक्टोबर|नोव्हेंबर|डिसेंबर)'
    + _WORD_END
)

# "<number word> प्रतिशत" (Hindi) or "<number word> टक्के" (Marathi); only
# the listed number words are recognised.
PERCENT_PATTERN = (
    _WORD_START
    + r'(?:साठ|पंचाहत्तर|सत्तर|ऐंशी|नव्वद|पचास)\s?(?:प्रतिशत|टक्के)'
    + _WORD_END
)

# -------------------- Functions --------------------
def extract_regex_entities(text, lang):
    """Find TIME, DATE, PERCENTAGE, EVENT and GAME mentions in ``text``.

    Args:
        text: Raw input string to scan.
        lang: Language code; ``'hi'`` selects the Hindi date pattern and
            keyword lists, any other value selects the Marathi ones.

    Returns:
        A list of ``(entity_text, label)`` tuples grouped by label in the
        order TIME, DATE, PERCENTAGE, EVENT, GAME. Regex labels carry what
        ``re.findall`` returned for each match; EVENT and GAME carry each
        keyword that occurs as a substring of ``text``.
    """
    is_hindi = lang == 'hi'
    date_pattern = DATE_PATTERN_HI if is_hindi else DATE_PATTERN_MR
    event_keywords = EVENTS_LIST_HI if is_hindi else EVENTS_LIST_MR
    game_keywords = GAMES_LIST_HI if is_hindi else GAMES_LIST_MR

    found = []

    # Regex-driven labels, in reporting order.
    regex_labels = (
        (TIME_PATTERN, 'TIME'),
        (date_pattern, 'DATE'),
        (PERCENT_PATTERN, 'PERCENTAGE'),
    )
    for pattern, label in regex_labels:
        found += [(match, label) for match in re.findall(pattern, text)]

    # Keyword labels use plain substring containment.
    found += [(event, 'EVENT') for event in event_keywords if event in text]
    found += [(game, 'GAME') for game in game_keywords if game in text]

    return found

def extract_names(text, lang):
    """Report the listed names that occur as substrings of ``text``.

    Args:
        text: Raw input string to scan.
        lang: Language code; ``'hi'`` selects ``NAMES_LIST_HI``, any other
            value selects ``NAMES_LIST_MR``.

    Returns:
        A list of ``(name, 'PERSON')`` tuples in name-list order.
    """
    candidates = NAMES_LIST_HI if lang == 'hi' else NAMES_LIST_MR
    return [(candidate, 'PERSON') for candidate in candidates if candidate in text]

def process_sentence(text, nlp, lang):
    """Combine pipeline, regex and name-list entities for one sentence.

    Args:
        text: Raw input string.
        nlp: Callable pipeline; its result must expose ``sentences``, each
            with an ``ents`` sequence of objects having ``text`` and ``type``.
        lang: Language code forwarded to ``extract_regex_entities`` and
            ``extract_names``.

    Returns:
        A list of unique ``(entity_text, label)`` tuples in first-seen order:
        pipeline entities first, then regex/keyword entities, then names.
        Pipeline entities of type ``'PERSON'`` keep that label; those of type
        ``'LOC'``, ``'GPE'`` or ``'NEL'`` are labelled ``'GPE'``; all other
        pipeline entity types are dropped.
    """
    doc = nlp(text)
    kept_types = ('PERSON', 'LOC', 'GPE', 'NEL')

    collected = [
        (ent.text, 'PERSON' if ent.type == 'PERSON' else 'GPE')
        for sent in doc.sentences
        for ent in sent.ents
        if ent.type in kept_types
    ]
    collected.extend(extract_regex_entities(text, lang))
    collected.extend(extract_names(text, lang))

    # dict keys keep insertion order, so this drops repeats while preserving
    # the position of each tuple's first occurrence.
    return list(dict.fromkeys(collected))

# -------------------- New Sentences --------------------
# Hard-coded demo inputs, one per language. Each is passed to
# process_sentence() together with the matching pipeline and language code.
hindi_sentence = "राहुल मुंबई के क्रिकेट टूर्नामेंट में बीस अगस्त को चार बजे भाग लेंगे और शतरंज खेलेंगे, बारिश की पचास प्रतिशत संभावना है।"
marathi_sentence = "सारा पुण्यातील संगीत उत्सवात पंधरा सप्टेंबर रोजी सहा वाजता सहभागी होईल आणि टेनिस खेळेल, विजेतेपदाची साठ टक्के शक्यता आहे।"

# -------------------- Process Hindi --------------------
# process_sentence() returns de-duplicated (text, label) tuples; print one
# entity per line.
entities_hi = process_sentence(hindi_sentence, nlp_hi, 'hi')
print("\n🔹 Stanza NER Result (Hindi):")
for text, label in entities_hi:
    print(f"Entity: {text}, Type: {label}")

# -------------------- Process Marathi --------------------
# Same steps as above with the Marathi pipeline and the 'mr' language code.
entities_mr = process_sentence(marathi_sentence, nlp_mr, 'mr')
print("\n🔹 Stanza NER Result (Marathi):")
for text, label in entities_mr:
    print(f"Entity: {text}, Type: {label}")



🔹 Stanza NER Result (Hindi):
Entity: मुंबई, Type: GPE
Entity: बीस अगस्त, Type: DATE
Entity: पचास प्रतिशत, Type: PERCENTAGE
Entity: क्रिकेट टूर्नामेंट, Type: EVENT
Entity: क्रिकेट, Type: GAME
Entity: शतरंज, Type: GAME
Entity: राहुल, Type: PERSON

🔹 Stanza NER Result (Marathi):
Entity: पुण्यातील, Type: GPE
Entity: पंधरा सप्टेंबर, Type: DATE
Entity: उत्सव, Type: EVENT
Entity: संगीत उत्सव, Type: EVENT
Entity: टेनिस, Type: GAME
Entity: सारा, Type: PERSON
