# Ngram any Language

In [1]:
# Install necessary libraries
!pip install nltk

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Download NLTK data (you may need to run this only once)
nltk.download('punkt')

def create_ngram_model(sentences, n):
    """
    Create an n-gram model from a list of sentences.

    Args:
        sentences (list of str): The sentences to use for the n-gram model.
        n (int): The value of n for the n-gram model.

    Returns:
        ngram_model (dict): The n-gram model as a dictionary.
    """
    ngram_model = defaultdict(lambda: defaultdict(lambda: 0))

    for sentence in sentences:
        tokens = word_tokenize(sentence)  # No need to lower case for Hindi/Marathi/Gujarati
        for ngram in ngrams(tokens, n):
            prefix, suffix = tuple(ngram[:-1]), ngram[-1]
            ngram_model[prefix][suffix] += 1

    # Convert counts to probabilities
    for prefix in ngram_model:
        total_count = float(sum(ngram_model[prefix].values()))
        for suffix in ngram_model[prefix]:
            ngram_model[prefix][suffix] /= total_count

    return ngram_model

def predict_next_word(ngram_model, sentence, n):
    """
    Predict the next word given a sentence based on the n-gram model.

    Args:
        ngram_model (dict): The n-gram model.
        sentence (str): The sentence for which to predict the next word.
        n (int): The value of n for the n-gram model.

    Returns:
        next_word (str): The most probable next word.
    """
    tokens = word_tokenize(sentence)

    if len(tokens) < n - 1:
        return "Not enough context to predict."

    prefix = tuple(tokens[-(n - 1):])

    if prefix not in ngram_model:
        return "No prediction available for this context."

    next_word = max(ngram_model[prefix], key=ngram_model[prefix].get)
    return next_word

def calculate_probability(ngram_model, sentence, n):
    """
    Calculate the probability of a given sentence based on the n-gram model.

    Args:
        ngram_model (dict): The n-gram model.
        sentence (str): The sentence for which to calculate the probability.
        n (int): The value of n for the n-gram model.

    Returns:
        probability (float): The probability of the sentence.
    """
    tokens = word_tokenize(sentence)
    probability = 1.0

    if len(tokens) < n:
        return 0.0

    for i in range(n - 1, len(tokens)):
        prefix = tuple(tokens[i - (n - 1):i])
        suffix = tokens[i]

        if prefix in ngram_model and suffix in ngram_model[prefix]:
            probability *= ngram_model[prefix][suffix]
        else:
            return 0.0

    return probability

def main():
    # Get user input
    n = int(input("Enter the value of n for the n-gram model: "))
    language = input("Enter the language (Hindi, Gujarati, Marathi): ").lower()

    if language == "hindi":
        sentences = input("Enter the Hindi sentences for the corpus (separated by ';'): ").split(';')
    elif language == "gujarati":
        sentences = input("Enter the Gujarati sentences for the corpus (separated by ';'): ").split(';')
    elif language == "marathi":
        sentences = input("Enter the Marathi sentences for the corpus (separated by ';'): ").split(';')
    else:
        print("Unsupported language.")
        return

    test_sentence = input("Enter the sentence to predict the next word: ")

    # Create n-gram model
    ngram_model = create_ngram_model(sentences, n)

    # Calculate probability of a test sentence
    probability = calculate_probability(ngram_model, test_sentence, n)
    print(f"Probability of '{test_sentence}': {probability}")

    # Predict the next word
    next_word = predict_next_word(ngram_model, test_sentence, n)
    print(f"Predicted next word for '{test_sentence}': {next_word}")

if __name__ == "__main__":
    main()





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91636\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unsupported language.


# Hindi Experiments

In [2]:
# Install necessary libraries
!pip install indic-nlp-library
!pip install stanza

# Import libraries
from indicnlp.tokenize import indic_tokenize
import stanza

# Download the Stanza model for Hindi
stanza.download('hi')

# Define stopwords based on the given text
custom_hindi_stopwords = [
    "में", "की", "का", "और", "है", "को", "के", "से", "हैं", "यह", "ने", "जिसमें", "किए",
    "जिन", "जो", "का", "की", "यदि", "का", "पर", "अब", "कुछ", "तक", "लोगों", "का", "कोई", "इनमें"
]

# Define the Hindi text for processing
text_for_processing = """
2010 में, नासा की ग्रह रक्षा टीम ने पृथ्वी के पास 1 किमी चौड़े 90 प्रतिशत क्षुद्रग्रहों की पहचान की थी और उन्हें लॉग किया था। ये 'पृथ्वी के निकट की वस्तुएं' या NEO, पहाड़ों के आकार की हैं और इनमें पृथ्वी की कक्षा के 50 मिलियन किलोमीटर के भीतर की कोई भी चीज़ शामिल है। लॉग इन करने के लिए अनुमानित 50 शेष होने पर, नासा का कहना है कि जिन 887 के बारे में वह जानता है उनमें से कोई भी ग्रह के लिए महत्वपूर्ण खतरा नहीं है।
अब नासा कुछ छोटे क्षुद्रग्रहों को लॉग करने की दिशा में काम कर रहा है, जिनकी चौड़ाई 140 मीटर या उससे अधिक है। इस आकार के 25,000 अनुमानित क्षुद्रग्रहों में से, अब तक लगभग 8,000 को लॉग किया जा चुका है, 17,000 को छोड़ दिया गया है। यह देखते हुए कि 2013 में रूस के चेल्याबिंस्क शहर के ऊपर 19 मीटर के क्षुद्रग्रह में विस्फोट हुआ था, जिसमें 1,200 लोग घायल हो गए थे, ये मध्यम आकार के क्षुद्रग्रह यदि पृथ्वी की कक्षा में प्रवेश करते हैं तो एक गंभीर खतरा होंगे।
"""

# Tokenization using Indic NLP for Hindi
tokens = indic_tokenize.trivial_tokenize(text_for_processing, lang='hi')
print("Tokens:", tokens)

# Stanza Pipeline for Hindi
nlp_hindi = stanza.Pipeline('hi')
doc = nlp_hindi(text_for_processing)
lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
print("Lemmas:", lemmas)

# Filter out custom stopwords
filtered_tokens = [token for token in tokens if token not in custom_hindi_stopwords]
print("Filtered Tokens:", filtered_tokens)


Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.1-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-8.1.3-py3-none-any.whl.metadata (6.4 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.21.2-py3-none-any.whl.metadata (2.8 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sphinxcontrib-applehelp>=1.0.7 (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library)
  Downloading

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.4.3 requires jedi<0.19.0,>=0.17.2, but you have jedi 0.19.1 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:51:11 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json
2024-11-05 23:51:11 INFO: Downloading default packages for language: hi (Hindi) ...
2024-11-05 23:51:12 INFO: File exists: C:\Users\91636\stanza_resources\hi\default.zip
2024-11-05 23:51:16 INFO: Finished downloading models and saved to C:\Users\91636\stanza_resources
2024-11-05 23:51:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Tokens: ['\n2010', 'में', ',', 'नासा', 'की', 'ग्रह', 'रक्षा', 'टीम', 'ने', 'पृथ्वी', 'के', 'पास', '1', 'किमी', 'चौड़े', '90', 'प्रतिशत', 'क्षुद्रग्रहों', 'की', 'पहचान', 'की', 'थी', 'और', 'उन्हें', 'लॉग', 'किया', 'था', '।', 'ये', "'", 'पृथ्वी', 'के', 'निकट', 'की', 'वस्तुएं', "'", 'या', 'NEO', ',', 'पहाड़ों', 'के', 'आकार', 'की', 'हैं', 'और', 'इनमें', 'पृथ्वी', 'की', 'कक्षा', 'के', '50', 'मिलियन', 'किलोमीटर', 'के', 'भीतर', 'की', 'कोई', 'भी', 'चीज़', 'शामिल', 'है', '।', 'लॉग', 'इन', 'करने', 'के', 'लिए', 'अनुमानित', '50', 'शेष', 'होने', 'पर', ',', 'नासा', 'का', 'कहना', 'है', 'कि', 'जिन', '887', 'के', 'बारे', 'में', 'वह', 'जानता', 'है', 'उनमें', 'से', 'कोई', 'भी', 'ग्रह', 'के', 'लिए', 'महत्वपूर्ण', 'खतरा', 'नहीं', 'है', '।', '\nअब', 'नासा', 'कुछ', 'छोटे', 'क्षुद्रग्रहों', 'को', 'लॉग', 'करने', 'की', 'दिशा', 'में', 'काम', 'कर', 'रहा', 'है', ',', 'जिनकी', 'चौड़ाई', '140', 'मीटर', 'या', 'उससे', 'अधिक', 'है', '।', 'इस', 'आकार', 'के', '25,000', 'अनुमानित', 'क्षुद्रग्रहों', 'में', 'से', ',', 'अब', 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:51:17 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json
2024-11-05 23:51:18 INFO: Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

2024-11-05 23:51:18 INFO: Using device: cpu
2024-11-05 23:51:18 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:51:21 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:51:21 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:51:21 INFO: Loading: depparse
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:51:22 INFO: Done loading processors!


Lemmas: ['2010', 'में', ',', 'नासा', 'का', 'ग्रह', 'रक्षा', 'टीम', 'ने', 'पृथ्वी', 'का', 'पास', '1', 'किमी', 'चौड़ा', '90', 'प्रतिशत', 'क्षुद्रग्रह', 'का', 'पहचान', 'करना', 'था', 'और', 'वह', 'लॉग', 'करना', 'था', '।', 'यह', "'", 'पृथ्वी', 'का', 'निकट', 'का', 'वस्तु', "'", 'या', 'NEO', ',', 'पहाड', 'का', 'आकार', 'का', 'है', 'और', 'यह', 'पृथ्वी', 'का', 'कक्षा', 'का', '50', 'मिलियन', 'किलोमीटर', 'का', 'भीतर', 'का', 'कोई', 'भी', 'चीज़', 'शामिल', 'है', '।', 'लॉग', 'यह', 'करना', 'का', 'लिए', 'अनुमानित', '50', 'शेष', 'होना', 'पर', ',', 'नासा', 'का', 'कहना', 'है', 'कि', 'जो', '887', 'का', 'बारे', 'में', 'वह', 'जानना', 'है', 'वह', 'से', 'कोई', 'भी', 'ग्रह', 'का', 'लिए', 'महत्वपूर्ण', 'खतरा', 'नहीं', 'है', '।', 'अब', 'नासा', 'कुछ', 'छोटा', 'क्षुद्रग्रह', 'को', 'लॉग', 'करना', 'का', 'दिशा', 'में', 'काम', 'करना', 'रहना', 'है', ',', 'जो', 'चौड़ाई', '140', 'मीटर', 'या', 'वह', 'अधिक', 'है', '।', 'यह', 'आकार', 'का', '25000', 'अनुमानित', 'क्षुद्रग्रह', 'में', 'से', ',', 'अब', 'तक', 'लगभग', '8000', 'को', 

In [4]:
# Import necessary libraries
from indicnlp.tokenize import indic_tokenize
import stanza
from nltk import Tree
from nltk.stem import SnowballStemmer

# Download necessary resources
stanza.download('hi')

# Define the Hindi text
text_for_processing = """
2010 में, नासा की ग्रह रक्षा टीम ने पृथ्वी के पास 1 किमी चौड़े 90 प्रतिशत क्षुद्रग्रहों की पहचान की थी और उन्हें लॉग किया था। ये 'पृथ्वी के निकट की वस्तुएं' या NEO, पहाड़ों के आकार की हैं और इनमें पृथ्वी की कक्षा के 50 मिलियन किलोमीटर के भीतर की कोई भी चीज़ शामिल है। लॉग इन करने के लिए अनुमानित 50 शेष होने पर, नासा का कहना है कि जिन 887 के बारे में वह जानता है उनमें से कोई भी ग्रह के लिए महत्वपूर्ण खतरा नहीं है।
अब नासा कुछ छोटे क्षुद्रग्रहों को लॉग करने की दिशा में काम कर रहा है, जिनकी चौड़ाई 140 मीटर या उससे अधिक है। इस आकार के 25,000 अनुमानित क्षुद्रग्रहों में से, अब तक लगभग 8,000 को लॉग किया जा चुका है, 17,000 को छोड़ दिया गया है। यह देखते हुए कि 2013 में रूस के चेल्याबिंस्क शहर के ऊपर 19 मीटर के क्षुद्रग्रह में विस्फोट हुआ था, जिसमें 1,200 लोग घायल हो गए थे, ये मध्यम आकार के क्षुद्रग्रह यदि पृथ्वी की कक्षा में प्रवेश करते हैं तो एक गंभीर खतरा होंगे।
"""

# Initialize Stanza Pipeline for Hindi
nlp_hindi = stanza.Pipeline('hi')

# Process the text using Stanza
doc = nlp_hindi(text_for_processing)

# 1. Morphological Analysis
print("Morphological Analysis:")
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"Word: {word.text}, Lemma: {word.lemma}, POS: {word.pos}, Morphology: {word.feats}")

# 2. POS Tagging (Rule-Based)
print("\nPOS Tagging (Stanza + Rule-Based):")
for sentence in doc.sentences:
    for word in sentence.words:
        pos_tag = word.pos
        # Basic rule-based adjustments if needed
        if word.text in ["हैं", "है"]:  # Example rule: recognize "हैं" and "है" as auxiliary verbs
            pos_tag = "AUX"
        print(f"Word: {word.text}, POS Tag: {pos_tag}")

# 3. Chunking (Noun and Verb Phrases)
print("\nChunking (Noun and Verb Phrases):")
chunked_sentences = []
for sentence in doc.sentences:
    chunked_sentence = []
    current_chunk = []
    for word in sentence.words:
        if word.pos in ["NOUN", "PROPN"]:  # Noun Phrase
            current_chunk.append(word.text)
        elif current_chunk:
            chunked_sentence.append(("NP", " ".join(current_chunk)))
            current_chunk = []
        if word.pos in ["VERB", "AUX"]:  # Verb Phrase
            chunked_sentence.append(("VP", word.text))
    if current_chunk:  # Append any remaining noun phrase
        chunked_sentence.append(("NP", " ".join(current_chunk)))
    chunked_sentences.append(chunked_sentence)

# Display chunked sentences
for i, chunks in enumerate(chunked_sentences):
    print(f"Sentence {i+1} Chunks:")
    for chunk_type, chunk_text in chunks:
        print(f"  {chunk_type}: {chunk_text}")

# 4. Named Entity Recognition (NER)
import re

# Predefined lists of known entities specific to the text
KNOWN_LOCATIONS = ["पृथ्वी", "चेल्याबिंस्क", "रूस"]
KNOWN_ORGANIZATIONS = ["नासा"]
KNOWN_ENTITY_TYPES = ["क्षुद्रग्रह", "ग्रह", "कक्षा", "वस्तुएं"]


# Rule-based NER function
def rule_based_ner(text_for_processing):
    entities = []

    # Match predefined locations
    for location in KNOWN_LOCATIONS:
        if location in text_for_processing:
            entities.append({
                'entity': location,
                'type': 'LOCATION'
            })

    # Match predefined organizations
    for org in KNOWN_ORGANIZATIONS:
        if org in text_for_processing:
            entities.append({
                'entity': org,
                'type': 'ORGANIZATION'
            })

    # Match other domain-specific entity types
    for entity_type in KNOWN_ENTITY_TYPES:
        if entity_type in text_for_processing:
            entities.append({
                'entity': entity_type,
                'type': 'DOMAIN_ENTITY'
            })

    # Regex pattern for detecting years and large numbers (e.g., 2010, 50 million, 25,000)
    years = re.findall(r'\b\d{4}\b', text_for_processing)
    for year in years:
        entities.append({
            'entity': year,
            'type': 'DATE'
        })

    # Regex for large numbers (used in astronomy context, such as distances or quantities)
    large_numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*\b', text_for_processing)
    for number in large_numbers:
        entities.append({
            'entity': number,
            'type': 'QUANTITY'
        })

    return entities

# Perform rule-based NER
ner_results = rule_based_ner(text_for_processing)

# Print recognized entities
print("Recognized Entities:")
for entity in ner_results:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}")



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:52:37 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json
2024-11-05 23:52:37 INFO: Downloading default packages for language: hi (Hindi) ...
2024-11-05 23:52:38 INFO: File exists: C:\Users\91636\stanza_resources\hi\default.zip
2024-11-05 23:52:42 INFO: Finished downloading models and saved to C:\Users\91636\stanza_resources
2024-11-05 23:52:42 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:52:43 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json
2024-11-05 23:52:44 INFO: Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

2024-11-05 23:52:44 INFO: Using device: cpu
2024-11-05 23:52:44 INFO: Loading: tokenize
2024-11-05 23:52:44 INFO: Loading: pos
2024-11-05 23:52:45 INFO: Loading: lemma
2024-11-05 23:52:45 INFO: Loading: depparse
2024-11-05 23:52:46 INFO: Done loading processors!


Morphological Analysis:
Word: 2010, Lemma: 2010, POS: PROPN, Morphology: Case=Acc|Gender=Masc|Number=Sing|Person=3
Word: में, Lemma: में, POS: ADP, Morphology: AdpType=Post
Word: ,, Lemma: ,, POS: PUNCT, Morphology: None
Word: नासा, Lemma: नासा, POS: PROPN, Morphology: Case=Acc|Gender=Masc|Number=Sing|Person=3
Word: की, Lemma: का, POS: ADP, Morphology: AdpType=Post|Case=Acc|Gender=Fem|Number=Sing
Word: ग्रह, Lemma: ग्रह, POS: PROPN, Morphology: Case=Nom|Gender=Masc|Number=Sing|Person=3
Word: रक्षा, Lemma: रक्षा, POS: PROPN, Morphology: Case=Nom|Gender=Fem|Number=Sing|Person=3
Word: टीम, Lemma: टीम, POS: PROPN, Morphology: Case=Acc|Gender=Fem|Number=Sing|Person=3
Word: ने, Lemma: ने, POS: ADP, Morphology: AdpType=Post
Word: पृथ्वी, Lemma: पृथ्वी, POS: PROPN, Morphology: Case=Acc|Gender=Fem|Number=Sing|Person=3
Word: के, Lemma: का, POS: ADP, Morphology: AdpType=Post
Word: पास, Lemma: पास, POS: ADP, Morphology: AdpType=Post|Case=Nom|Gender=Masc|Number=Sing|Person=3
Word: 1, Lemma: 1, POS:

# Marathi Experiments

In [5]:
# Install necessary libraries (uncomment these lines to install in your environment)
# !pip install indic-nlp-library
# !pip install stanza

# Import libraries
from indicnlp.tokenize import indic_tokenize
import stanza

# Download the Stanza model for Marathi (uncomment if not downloaded)
# stanza.download('mr')

# Define stopwords for Marathi directly in the code
marathi_stopwords = [
    "आहे", "आहेत", "होते", "होता", "असे", "मध्ये", "आणि", "किंवा", "तर", "पण",
    "हे", "ही", "का", "नाही", "काय", "मी", "आम्ही", "तुम्ही", "तो", "ती", "ते",
    "यांचा", "यांची", "त्यांचा", "त्यांची", "यात", "त्यात", "मध्ये", "म्हणून",
    "ने", "च्या", "ह्या", "की", "कोणतीही", "ना", "जर", "यामध्ये", "येत्या", "वर",
    "इतके", "पासून", "सोबत", "यासाठी", "या"
]

# Define the NASA-related Marathi text
nasa_marathi_text = """
2010 मध्ये, नासाच्या ग्रह संरक्षण टीमने पृथ्वीच्या जवळ 1 किमी रुंद 90 टक्के क्षुद्रग्रहांची ओळख करून त्यांची नोंद केली.
हे 'पृथ्वीच्या जवळच्या वस्तू' किंवा NEO म्हणून ओळखले जातात, ज्यांचे आकार पर्वतांप्रमाणे आहेत आणि यात पृथ्वीच्या कक्षेत 50 दशलक्ष किलोमीटरच्या आत असलेली कोणतीही वस्तू समाविष्ट आहे.
नोंद करण्यासाठी अंदाजे 50 उरले असल्यास, नासाचे म्हणणे आहे की त्याला माहित असलेल्या 887 मध्ये कोणताही ग्रहासाठी महत्त्वाचा धोका नाही.
आता नासा काही लहान क्षुद्रग्रहांची नोंद करण्यासाठी काम करत आहे, ज्याची रुंदी 140 मीटर किंवा त्याहून अधिक आहे.
या आकाराच्या 25,000 क्षुद्रग्रहांपैकी, आतापर्यंत सुमारे 8,000 नोंदवले गेले आहेत, तर 17,000 टाकले गेले आहेत.
2013 मध्ये रशियाच्या चेल्याबिंस्क शहराच्या वर 19 मीटरच्या क्षुद्रग्रहाचा विस्फोट झाला होता, ज्यात 1,200 लोक जखमी झाले होते, त्यामुळे हे मध्यम आकाराचे क्षुद्रग्रह पृथ्वीच्या कक्षेत प्रवेश केल्यास गंभीर धोका बनू शकतात.
"""

# Tokenization using Indic NLP for Marathi
nasa_marathi_tokens = indic_tokenize.trivial_tokenize(nasa_marathi_text, lang='mr')
print("NASA Marathi Tokens:", nasa_marathi_tokens)

# Stanza Pipeline for Marathi
nlp_marathi = stanza.Pipeline('mr')
doc_nasa_marathi = nlp_marathi(nasa_marathi_text)
nasa_marathi_lemmas = [word.lemma for sent in doc_nasa_marathi.sentences for word in sent.words]
print("NASA Marathi Lemmas:", nasa_marathi_lemmas)

# Filter out Marathi stopwords
filtered_nasa_marathi_tokens = [token for token in nasa_marathi_tokens if token not in marathi_stopwords]
print("Filtered NASA Marathi Tokens:", filtered_nasa_marathi_tokens)


2024-11-05 23:52:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


NASA Marathi Tokens: ['\n2010', 'मध्ये', ',', 'नासाच्या', 'ग्रह', 'संरक्षण', 'टीमने', 'पृथ्वीच्या', 'जवळ', '1', 'किमी', 'रुंद', '90', 'टक्के', 'क्षुद्रग्रहांची', 'ओळख', 'करून', 'त्यांची', 'नोंद', 'केली', '.', '\nहे', "'", 'पृथ्वीच्या', 'जवळच्या', 'वस्तू', "'", 'किंवा', 'NEO', 'म्हणून', 'ओळखले', 'जातात', ',', 'ज्यांचे', 'आकार', 'पर्वतांप्रमाणे', 'आहेत', 'आणि', 'यात', 'पृथ्वीच्या', 'कक्षेत', '50', 'दशलक्ष', 'किलोमीटरच्या', 'आत', 'असलेली', 'कोणतीही', 'वस्तू', 'समाविष्ट', 'आहे', '.', '\nनोंद', 'करण्यासाठी', 'अंदाजे', '50', 'उरले', 'असल्यास', ',', 'नासाचे', 'म्हणणे', 'आहे', 'की', 'त्याला', 'माहित', 'असलेल्या', '887', 'मध्ये', 'कोणताही', 'ग्रहासाठी', 'महत्त्वाचा', 'धोका', 'नाही', '.', '\nआता', 'नासा', 'काही', 'लहान', 'क्षुद्रग्रहांची', 'नोंद', 'करण्यासाठी', 'काम', 'करत', 'आहे', ',', 'ज्याची', 'रुंदी', '140', 'मीटर', 'किंवा', 'त्याहून', 'अधिक', 'आहे', '.', '\nया', 'आकाराच्या', '25,000', 'क्षुद्रग्रहांपैकी', ',', 'आतापर्यंत', 'सुमारे', '8,000', 'नोंदवले', 'गेले', 'आहेत', ',', 'तर', '17,000', '

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:52:49 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json


Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/tokenize/ufal.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/mwt/ufal.pt:   0%|          | 0…

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/pos/ufal_charlm.pt:   0%|      …

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/lemma/ufal_nocharlm.pt:   0%|  …

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/depparse/ufal_charlm.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/sentiment/l3cube_charlm.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/ner/l3cube.pt:   0%|          |…

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/forward_charlm/l3cube.pt:   0%|…

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/backward_charlm/l3cube.pt:   0%…

Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.9.0/models/pretrain/fasttextwiki.pt:   0%|…

2024-11-05 23:57:06 INFO: Loading these models for language: mr (Marathi):
| Processor | Package       |
-----------------------------
| tokenize  | ufal          |
| mwt       | ufal          |
| pos       | ufal_charlm   |
| lemma     | ufal_nocharlm |
| depparse  | ufal_charlm   |
| sentiment | l3cube_charlm |
| ner       | l3cube        |

2024-11-05 23:57:06 INFO: Using device: cpu
2024-11-05 23:57:06 INFO: Loading: tokenize
2024-11-05 23:57:06 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:57:06 INFO: Loading: pos
2024-11-05 23:57:07 INFO: Loading: lemma
2024-11-05 23:57:07 INFO: Loading: depparse
2024-11-05 23:57:08 INFO: Loading: sentiment
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:57:09 INFO: Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-05 23:57:11 INFO: Done loading processors!


NASA Marathi Lemmas: ['2010', 'मधे', ',', 'नास', 'चा', 'ग्रह', 'संरक्षण', 'टीम', 'पृथ्वी', 'चा', 'जवळ', '1', 'किमी', 'रुंद', '90', 'टक्के', 'क्षुद्र', 'चा', 'ओळख', 'करणे', 'तो', 'चा', 'नोंद', 'करणे', '.', 'हा', "'पृथ्वी", 'चा', 'जवळ', 'चा', "वस्तू'", 'किंवा', 'NEO', 'म्हणून', 'ओळखणे', 'जाणे', ',', 'जा', 'चा', 'आकार', 'पर्व', 'प्राणे', 'असणे', 'आणि', 'यात', 'पृथ्वी', 'चा', 'कक्षेत', '50', 'दशलक्ष', 'किलोमी', 'आत', 'असणे', 'कोणती', 'हा', 'वस्तू', 'समाविष्ट', 'असणे', '.', 'नोंद', 'करणे', 'साठी', 'अंदाजे', '50', 'उरणे', 'असल्यास', ',', 'नास', 'चा', 'म्हणणे', 'असणे', 'की', 'तो', 'माहित', 'असलेल्या', '887', 'मधे', 'कोणता', 'ही', 'ग्रहास', 'ठी', 'महत्त्वा', 'चा', 'धोक', 'असणे', '.', 'आता', 'नास', 'काही', 'लहान', 'क्षुद्र', 'चा', 'नोंद', 'करणे', 'साठी', 'काम', 'करणे', 'असणे', ',', 'जा', 'चा', 'रुंदी', '140', 'मीटर', 'किंवा', 'त्याहून', 'अधिक', 'असणे', '.', 'हा', 'आकार', 'चा', '25', ',', '000', 'क्षुद्र', ',', 'आतापर्यं', 'तत', 'सुमार', '8,', '000', 'नोंदवले', 'जाणे', 'असणे', ',', 'तर', '17,', 

In [6]:
# Import necessary libraries
from indicnlp.tokenize import indic_tokenize
import stanza
import re

# Download necessary resources for Marathi
# Uncomment the following line if you haven't downloaded the model yet
# stanza.download('mr')

# Define the Marathi text for processing
text_for_processing = """
2010 मध्ये, नासाच्या ग्रह संरक्षण टीमने पृथ्वीच्या जवळ 1 किमी रुंद 90 टक्के क्षुद्रग्रहांची ओळख करून त्यांची नोंद केली.
हे 'पृथ्वीच्या जवळच्या वस्तू' किंवा NEO म्हणून ओळखले जातात, ज्यांचे आकार पर्वतांप्रमाणे आहेत आणि यात पृथ्वीच्या कक्षेत 50 दशलक्ष किलोमीटरच्या आत असलेली कोणतीही वस्तू समाविष्ट आहे.
नोंद करण्यासाठी अंदाजे 50 उरले असल्यास, नासाचे म्हणणे आहे की त्याला माहित असलेल्या 887 मध्ये कोणताही ग्रहासाठी महत्त्वाचा धोका नाही.
आता नासा काही लहान क्षुद्रग्रहांची नोंद करण्यासाठी काम करत आहे, ज्याची रुंदी 140 मीटर किंवा त्याहून अधिक आहे.
या आकाराच्या 25,000 क्षुद्रग्रहांपैकी, आतापर्यंत सुमारे 8,000 नोंदवले गेले आहेत, तर 17,000 टाकले गेले आहेत.
2013 मध्ये रशियाच्या चेल्याबिंस्क शहराच्या वर 19 मीटरच्या क्षुद्रग्रहाचा विस्फोट झाला होता, ज्यात 1,200 लोक जखमी झाले होते, त्यामुळे हे मध्यम आकाराचे क्षुद्रग्रह पृथ्वीच्या कक्षेत प्रवेश केल्यास गंभीर धोका बनू शकतात.
"""

# Initialize Stanza Pipeline for Marathi
nlp_marathi = stanza.Pipeline('mr')

# Process the text using Stanza
doc = nlp_marathi(text_for_processing)

# 1. Morphological Analysis
print("Morphological Analysis:")
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"Word: {word.text}, Lemma: {word.lemma}, POS: {word.pos}, Morphology: {word.feats}")

# 2. POS Tagging (Rule-Based)
print("\nPOS Tagging (Stanza + Rule-Based):")
for sentence in doc.sentences:
    for word in sentence.words:
        pos_tag = word.pos
        # Basic rule-based adjustments if needed
        if word.text in ["आहे", "आहेत"]:  # Recognize auxiliary verbs
            pos_tag = "AUX"
        print(f"Word: {word.text}, POS Tag: {pos_tag}")

# 3. Chunking (Noun and Verb Phrases)
print("\nChunking (Noun and Verb Phrases):")
chunked_sentences = []
for sentence in doc.sentences:
    chunked_sentence = []
    current_chunk = []
    for word in sentence.words:
        if word.pos in ["NOUN", "PROPN"]:  # Noun Phrase
            current_chunk.append(word.text)
        elif current_chunk:
            chunked_sentence.append(("NP", " ".join(current_chunk)))
            current_chunk = []
        if word.pos in ["VERB", "AUX"]:  # Verb Phrase
            chunked_sentence.append(("VP", word.text))
    if current_chunk:  # Append any remaining noun phrase
        chunked_sentence.append(("NP", " ".join(current_chunk)))
    chunked_sentences.append(chunked_sentence)

# Display chunked sentences
for i, chunks in enumerate(chunked_sentences):
    print(f"Sentence {i+1} Chunks:")
    for chunk_type, chunk_text in chunks:
        print(f"  {chunk_type}: {chunk_text}")

# 4. Named Entity Recognition (NER)
# Predefined lists of known entities specific to the text
KNOWN_LOCATIONS = ["पृथ्वी", "चेल्याबिंस्क", "रूस"]
KNOWN_ORGANIZATIONS = ["नासा"]
KNOWN_ENTITY_TYPES = ["क्षुद्रग्रह", "ग्रह", "कक्षा", "वस्तू"]

# Rule-based NER function
def rule_based_ner(text):
    entities = []

    # Match predefined locations
    for location in KNOWN_LOCATIONS:
        if location in text:
            entities.append({'entity': location, 'type': 'LOCATION'})

    # Match predefined organizations
    for org in KNOWN_ORGANIZATIONS:
        if org in text:
            entities.append({'entity': org, 'type': 'ORGANIZATION'})

    # Match other domain-specific entity types
    for entity_type in KNOWN_ENTITY_TYPES:
        if entity_type in text:
            entities.append({'entity': entity_type, 'type': 'DOMAIN_ENTITY'})

    # Regex pattern for detecting years and large numbers (e.g., 2010, 50 million, 25,000)
    years = re.findall(r'\b\d{4}\b', text)
    for year in years:
        entities.append({'entity': year, 'type': 'DATE'})

    # Regex for large numbers
    large_numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*\b', text)
    for number in large_numbers:
        entities.append({'entity': number, 'type': 'QUANTITY'})

    return entities

# Perform rule-based NER
ner_results = rule_based_ner(text_for_processing)

# Print recognized entities
print("Recognized Entities:")
for entity in ner_results:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}")


2024-11-05 23:57:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:57:18 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json
2024-11-05 23:57:22 INFO: Loading these models for language: mr (Marathi):
| Processor | Package       |
-----------------------------
| tokenize  | ufal          |
| mwt       | ufal          |
| pos       | ufal_charlm   |
| lemma     | ufal_nocharlm |
| depparse  | ufal_charlm   |
| sentiment | l3cube_charlm |
| ner       | l3cube        |

2024-11-05 23:57:22 INFO: Using device: cpu
2024-11-05 23:57:22 INFO: Loading: tokenize
2024-11-05 23:57:22 INFO: Loading: mwt
2024-11-05 23:57:22 INFO: Loading: pos
2024-11-05 23:57:23 INFO: Loading: lemma
2024-11-05 23:57:23 INFO: Loading: depparse
2024-11-05 23:57:24 INFO: Loading: sentiment
2024-11-05 23:57:25 INFO: Loading: ner
2024-11-05 23:57:27 INFO: Done loading processors!


Morphological Analysis:
Word: 2010, Lemma: 2010, POS: PROPN, Morphology: Case=Nom|Gender=Masc|Number=Sing
Word: मध्ये, Lemma: मधे, POS: ADV, Morphology: None
Word: ,, Lemma: ,, POS: PUNCT, Morphology: None
Word: नासा, Lemma: नास, POS: PROPN, Morphology: Case=Abs|Gender=Fem|Number=Sing
Word: च्या, Lemma: चा, POS: ADP, Morphology: Case=Abs
Word: ग्रह, Lemma: ग्रह, POS: NOUN, Morphology: Case=Nom|Gender=Masc|Number=Sing
Word: संरक्षण, Lemma: संरक्षण, POS: NOUN, Morphology: Case=Nom|Gender=Neut|Number=Sing
Word: टीमने, Lemma: टीम, POS: NOUN, Morphology: Case=Ins|Gender=Masc|Number=Sing
Word: पृथ्वी, Lemma: पृथ्वी, POS: NOUN, Morphology: Case=Abs|Gender=Fem|Number=Sing
Word: च्या, Lemma: चा, POS: PART, Morphology: None
Word: जवळ, Lemma: जवळ, POS: ADP, Morphology: None
Word: 1, Lemma: 1, POS: NUM, Morphology: None
Word: किमी, Lemma: किमी, POS: NUM, Morphology: None
Word: रुंद, Lemma: रुंद, POS: NOUN, Morphology: Case=Acc|Gender=Masc|Number=Plur
Word: 90, Lemma: 90, POS: NUM, Morphology: None

# Gujrati

In [7]:
# Install necessary libraries (uncomment these lines to install in your environment)
# !pip install indic-nlp-library

# Import libraries
from indicnlp.tokenize import indic_tokenize

# Define stopwords for Gujarati directly in the code
gujarati_stopwords = [
    "છે", "છે", "હોય", "હોય", "તમે", "કેમ", "પણ", "અને", "કે", "કેવું",
    "આ", "આવ", "તે", "તેને", "કેવી", "થવા", "છે", "ની", "કેટલાક", "જો",
    "ત્યારે", "ફરી", "કિંવાં", "હવે", "વસે", "આમ", "આય", "તન", "નહીં",
    "યે", "કરવા", "કરતા", "કરતી", "જ્યાં", "આજ", "નવો", "દૂર", "જાણવા"
]

# Define the NASA-related Gujarati text
nasa_gujarati_text = """
2010માં, નાસાની ગ્રહ સંરક્ષણ ટીમે પૃથ્વીના નજીક 1 કિમી પહોળા 90 ટકાના ક્ષુદ્રગ્રહોની ઓળખ કરી અને તેમનો નોટ કર્યો.
આને 'પૃથ્વીના નજીકની વસ્તુઓ' અથવા NEO તરીકે ઓળખવામાં આવે છે, જે પહાડોના કદની છે અને તેમાં પૃથ્વીના કક્ષામાં 50 મિલિયન કિલોમીટર દ્વારા આવેલું કોઈ પણ વસ્તુ સમાવિષ્ટ છે.
નોંધ લેવા માટે અંદાજે 50 બચ્યા છે, નાસાનું કહેવું છે કે તે જાણતા 887 માંથી કોઈપણ ગ્રહ માટે મહત્ત્વનો ખતરો નથી.
હવે નાસા કેટલાક નાના ક્ષુદ્રગ્રહોને નોંધવા માટે કામ કરી રહ્યો છે, જેણે પહોળાઈ 140 મીટર કે તેથી વધુ છે.
આ કદના 25,000 ક્ષુદ્રગ્રહોમાંથી, અત્યાર સુધીમાં લગભગ 8,000 નોંધાયા છે, 17,000 છોડાઈ ગયા છે.
2013 માં રશિયાના ચેલ્યાબિન્સ્ક શહેરની ઉપર 19 મીટર ના ક્ષુદ્રગ્રહમાં વિસ્ફોટ થયો હતો, જેમાં 1,200 લોકો ઘાયલ થયા હતા, તેથી આ મધ્યમ કદના ક્ષુદ્રગ્રહો પૃથ્વીની કક્ષામાં પ્રવેશ કરે તો ગંભીર ખતરો બની શકે છે.
"""

# Tokenization using Indic NLP for Gujarati
nasa_gujarati_tokens = indic_tokenize.trivial_tokenize(nasa_gujarati_text, lang='gu')
print("NASA Gujarati Tokens:", nasa_gujarati_tokens)

# Filter out Gujarati stopwords
filtered_nasa_gujarati_tokens = [token for token in nasa_gujarati_tokens if token not in gujarati_stopwords]
print("Filtered NASA Gujarati Tokens:", filtered_nasa_gujarati_tokens)


NASA Gujarati Tokens: ['\n2010માં', ',', 'નાસાની', 'ગ્રહ', 'સંરક્ષણ', 'ટીમે', 'પૃથ્વીના', 'નજીક', '1', 'કિમી', 'પહોળા', '90', 'ટકાના', 'ક્ષુદ્રગ્રહોની', 'ઓળખ', 'કરી', 'અને', 'તેમનો', 'નોટ', 'કર્યો', '.', '\nઆને', "'", 'પૃથ્વીના', 'નજીકની', 'વસ્તુઓ', "'", 'અથવા', 'NEO', 'તરીકે', 'ઓળખવામાં', 'આવે', 'છે', ',', 'જે', 'પહાડોના', 'કદની', 'છે', 'અને', 'તેમાં', 'પૃથ્વીના', 'કક્ષામાં', '50', 'મિલિયન', 'કિલોમીટર', 'દ્વારા', 'આવેલું', 'કોઈ', 'પણ', 'વસ્તુ', 'સમાવિષ્ટ', 'છે', '.', '\nનોંધ', 'લેવા', 'માટે', 'અંદાજે', '50', 'બચ્યા', 'છે', ',', 'નાસાનું', 'કહેવું', 'છે', 'કે', 'તે', 'જાણતા', '887', 'માંથી', 'કોઈપણ', 'ગ્રહ', 'માટે', 'મહત્ત્વનો', 'ખતરો', 'નથી', '.', '\nહવે', 'નાસા', 'કેટલાક', 'નાના', 'ક્ષુદ્રગ્રહોને', 'નોંધવા', 'માટે', 'કામ', 'કરી', 'રહ્યો', 'છે', ',', 'જેણે', 'પહોળાઈ', '140', 'મીટર', 'કે', 'તેથી', 'વધુ', 'છે', '.', '\nઆ', 'કદના', '25,000', 'ક્ષુદ્રગ્રહોમાંથી', ',', 'અત્યાર', 'સુધીમાં', 'લગભગ', '8,000', 'નોંધાયા', 'છે', ',', '17,000', 'છોડાઈ', 'ગયા', 'છે', '.', '\n2013', 'માં', 'રશિયાના

In [10]:
# Import necessary libraries
from indicnlp.tokenize import indic_tokenize
import stanza
import re

# Download necessary resources for Gujarati
# Uncomment the following line if you haven't downloaded the model yet
# stanza.download('gu')

# Define the Gujarati text for processing
text_for_processing = """
2010 માં, નાસાના ગ્રહ સંરક્ષણ ટીમે પૃથ્વીની નજીક 1 કિમી પહોળા 90 ટકાના ક્ષુદ્રગ્રહોને ઓળખ્યા અને તેમનો નોટ કર્યો.
આને 'પૃથ્વીના નજીકની વસ્તુઓ' અથવા NEO તરીકે ઓળખવામાં આવે છે, જે પહાડોના કદની છે અને તેમાં પૃથ્વીની કક્ષામાં 50 મિલિયન કિલોમીટર દ્વારા આવેલું કોઈ પણ વસ્તુ સમાવિષ્ટ છે.
નોંધ લેવા માટે અંદાજે 50 બચ્યા છે, નાસાનું કહેવું છે કે તે જાણતા 887 માંથી કોઈપણ ગ્રહ માટે મહત્ત્વનો ખતરો નથી.
હવે નાસા કેટલાક નાના ક્ષુદ્રગ્રહોને નોંધવા માટે કામ કરી રહ્યો છે, જેણે પહોળાઈ 140 મીટર કે તેથી વધુ છે.
આ કદના 25,000 ક્ષુદ્રગ્રહોમાંથી, અત્યાર સુધીમાં લગભગ 8,000 નોંધાયા છે, 17,000 છોડાઈ ગયા છે.
2013 માં રશિયાના ચેલ્યાબિન્સ્ક શહેરની ઉપર 19 મીટર ના ક્ષુદ્રગ્રહમાં વિસ્ફોટ થયો હતો, જેમાં 1,200 લોકો ઘાયલ થયા હતા, તેથી આ મધ્યમ કદના ક્ષુદ્રગ્રહો પૃથ્વીની કક્ષામાં પ્રવેશ કરે તો ગંભીર ખતરો બની શકે છે.
"""

# Initialize Stanza Pipeline for Gujarati
nlp_gujarati = stanza.Pipeline('gu')

# Process the text using Stanza
doc = nlp_gujarati(text_for_processing)

# 1. Morphological Analysis
print("Morphological Analysis:")
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"Word: {word.text}, Lemma: {word.lemma}, POS: {word.pos}, Morphology: {word.feats}")

# 2. POS Tagging (Rule-Based)
print("\nPOS Tagging (Stanza + Rule-Based):")
for sentence in doc.sentences:
    for word in sentence.words:
        pos_tag = word.pos
        # Basic rule-based adjustments if needed
        if word.text in ["છે", "છે"]:  # Recognize auxiliary verbs
            pos_tag = "AUX"
        print(f"Word: {word.text}, POS Tag: {pos_tag}")

# 3. Chunking (Noun and Verb Phrases)
print("\nChunking (Noun and Verb Phrases):")
chunked_sentences = []
for sentence in doc.sentences:
    chunked_sentence = []
    current_chunk = []
    for word in sentence.words:
        if word.pos in ["NOUN", "PROPN"]:  # Noun Phrase
            current_chunk.append(word.text)
        elif current_chunk:
            chunked_sentence.append(("NP", " ".join(current_chunk)))
            current_chunk = []
        if word.pos in ["VERB", "AUX"]:  # Verb Phrase
            chunked_sentence.append(("VP", word.text))
    if current_chunk:  # Append any remaining noun phrase
        chunked_sentence.append(("NP", " ".join(current_chunk)))
    chunked_sentences.append(chunked_sentence)

# Display chunked sentences
for i, chunks in enumerate(chunked_sentences):
    print(f"Sentence {i+1} Chunks:")
    for chunk_type, chunk_text in chunks:
        print(f"  {chunk_type}: {chunk_text}")

# 4. Named Entity Recognition (NER)
# Predefined lists of known entities specific to the text
KNOWN_LOCATIONS = ["પૃથ્વી", "ચેલ્યાબિન્સ્ક", "રશિયા"]
KNOWN_ORGANIZATIONS = ["નાસા"]
KNOWN_ENTITY_TYPES = ["ક્ષુદ્રગ્રહ", "ગ્રહ", "કક્ષા", "વસ્તુ"]

# Rule-based NER function
def rule_based_ner(text):
    entities = []

    # Match predefined locations
    for location in KNOWN_LOCATIONS:
        if location in text:
            entities.append({'entity': location, 'type': 'LOCATION'})

    # Match predefined organizations
    for org in KNOWN_ORGANIZATIONS:
        if org in text:
            entities.append({'entity': org, 'type': 'ORGANIZATION'})

    # Match other domain-specific entity types
    for entity_type in KNOWN_ENTITY_TYPES:
        if entity_type in text:
            entities.append({'entity': entity_type, 'type': 'DOMAIN_ENTITY'})

    # Regex pattern for detecting years and large numbers (e.g., 2010, 50 million, 25,000)
    years = re.findall(r'\b\d{4}\b', text)
    for year in years:
        entities.append({'entity': year, 'type': 'DATE'})

    # Regex for large numbers
    large_numbers = re.findall(r'\b\d{1,3}(?:,\d{3})*\b', text)
    for number in large_numbers:
        entities.append({'entity': number, 'type': 'QUANTITY'})

    return entities

# Perform rule-based NER
ner_results = rule_based_ner(text_for_processing)

# Print recognized entities
print("Recognized Entities:")
for entity in ner_results:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}")


2024-11-05 23:58:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-05 23:58:40 INFO: Downloaded file to C:\Users\91636\stanza_resources\resources.json


ValueError: No processors to load for language gu.  Language gu is currently unsupported