## **To study Preprocessing of text (Tokenization, Filtration, Script Validation, Stop Word Removal, Stemming)**

# NLTK library

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
wl = WordNetLemmatizer()

# Tokenization

In [None]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
print(words)

['It', 'is', 'important', 'to', 'by', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'once', '.']


# Stemming

In [None]:
ps  = PorterStemmer()
for w in words:
    print(ps.stem(w))


it
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


# Lemmtization

In [None]:
for w in words:
    print(wl.lemmatize(w))

It
is
important
to
by
very
pythonly
while
you
are
pythoning
with
python
.
All
pythoners
have
pythoned
poorly
at
least
once
.


# Sentence Tokenization

In [None]:
EXAMPLE_TEXT = "Hello Mr. Paul, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Paul, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


# **Regional language**

# Indic and Stanza- Hindi

In [None]:
# Install necessary libraries
!pip install indic-nlp-library
!pip install stanza

# Import libraries
from indicnlp.tokenize import indic_tokenize
import stanza

# Download the Stanza model for Hindi
stanza.download('hi')

# Define stopwords for Hindi directly in the code
hindi_stopwords = [
    "है", "हैं", "हो", "था", "थी", "थे", "और", "पर", "के", "को", "से", "केलिए", "तक",
    "का", "की", "के", "हूँ", "हमें", "तुम", "यह", "वह", "जो", "किस", "कौन", "कैसे",
    "मैं", "हम", "तुम्हें", "यहां", "वहां", "नहीं", "क्या", "कब", "क्यों", "क्योंकि"
]

# Define the Hindi text
hindi_text = "प्राकृतिक भाषा प्रसंस्करण भाषा विज्ञान, कंप्यूटर विज्ञान और कृत्रिम बुद्धिमत्ता का एक उपक्षेत्र है, जो कंप्यूटर और मानव भाषा के बीच पारस्परिक क्रियाओं से संबंधित है।"

# Tokenization using Indic NLP for Hindi
hindi_tokens = indic_tokenize.trivial_tokenize(hindi_text, lang='hi')
print("Hindi Tokens:", hindi_tokens)

# Stanza Pipeline for Hindi
nlp_hindi = stanza.Pipeline('hi')
doc_hindi = nlp_hindi(hindi_text)
hindi_lemmas = [word.lemma for sent in doc_hindi.sentences for word in sent.words]
print("Hindi Lemmas:", hindi_lemmas)

# Filter out Hindi stopwords
filtered_hindi_tokens = [token for token in hindi_tokens if token not in hindi_stopwords]
print("Filtered Hindi Tokens:", filtered_hindi_tokens)




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...


Downloading https://huggingface.co/stanfordnlp/stanza-hi/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Hindi Tokens: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'और', 'कृत्रिम', 'बुद्धिमत्ता', 'का', 'एक', 'उपक्षेत्र', 'है', ',', 'जो', 'कंप्यूटर', 'और', 'मानव', 'भाषा', 'के', 'बीच', 'पारस्परिक', 'क्रियाओं', 'से', 'संबंधित', 'है', '।']


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Hindi Lemmas: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'और', 'कृत्रिम', 'बुद्धिमत्ता', 'का', 'एक', 'उपक्षेत्र', 'है', ',', 'जो', 'कंप्यूटर', 'और', 'मानव', 'भाषा', 'का', 'बीच', 'पारस्परिक', 'क्रिया', 'से', 'संबंधित', 'है', '।']
Filtered Hindi Tokens: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'कृत्रिम', 'बुद्धिमत्ता', 'एक', 'उपक्षेत्र', ',', 'कंप्यूटर', 'मानव', 'भाषा', 'बीच', 'पारस्परिक', 'क्रियाओं', 'संबंधित', '।']


# Indic and Stanza-Marathi

In [None]:
# Install necessary libraries
!pip install indic-nlp-library
!pip install stanza

# Import libraries
from indicnlp.tokenize import indic_tokenize
import stanza

# Download the Stanza model for Marathi
stanza.download('mr')

# Define stopwords for Marathi directly in the code
marathi_stopwords = [
    "आहे", "आहेत", "होते", "होता", "असे", "मध्ये", "आणि", "किंवा", "तर", "पण",
    "हे", "ही", "का", "नाही", "काय", "मी", "आम्ही", "तुम्ही", "तो", "ती", "ते",
    "यांचा", "यांची", "त्यांचा", "त्यांची", "यात", "त्यात", "मध्ये", "म्हणून"
]

# Define the Marathi text
marathi_text = "नैसर्गिक भाषा प्रक्रिया भाषा विज्ञान, संगणक विज्ञान आणि कृत्रिम बुद्धिमत्ता यांचा उपक्षेत्र आहे, जो संगणक आणि मानवी भाषेतील संवादाशी संबंधित आहे."

# Tokenization using Indic NLP for Marathi
marathi_tokens = indic_tokenize.trivial_tokenize(marathi_text, lang='mr')
print("Marathi Tokens:", marathi_tokens)

# Stanza Pipeline for Marathi
nlp_marathi = stanza.Pipeline('mr')
doc_marathi = nlp_marathi(marathi_text)
marathi_lemmas = [word.lemma for sent in doc_marathi.sentences for word in sent.words]
print("Marathi Lemmas:", marathi_lemmas)

# Filter out Marathi stopwords
filtered_marathi_tokens = [token for token in marathi_tokens if token not in marathi_stopwords]
print("Filtered Marathi Tokens:", filtered_marathi_tokens)





Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: mr (Marathi) ...
INFO:stanza:File exists: /root/stanza_resources/mr/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Marathi Tokens: ['नैसर्गिक', 'भाषा', 'प्रक्रिया', 'भाषा', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'आणि', 'कृत्रिम', 'बुद्धिमत्ता', 'यांचा', 'उपक्षेत्र', 'आहे', ',', 'जो', 'संगणक', 'आणि', 'मानवी', 'भाषेतील', 'संवादाशी', 'संबंधित', 'आहे', '.']


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: mr (Marathi):
| Processor | Package       |
-----------------------------
| tokenize  | ufal          |
| mwt       | ufal          |
| pos       | ufal_charlm   |
| lemma     | ufal_nocharlm |
| depparse  | ufal_charlm   |
| sentiment | l3cube_charlm |
| ner       | l3cube        |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Marathi Lemmas: ['नैसर्गिक', 'भाष', 'प्रक्र', 'भाष', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'आणि', 'कृत्रिम', 'बुद्धिमत्त', 'हा', 'चा', 'उपक्षेत्र', 'असणे', ',', 'जो', 'संगणक', 'आणि', 'मानवी', 'भाषेती', 'संवाद', 'संबंधणे', 'असणे', '.']
Filtered Marathi Tokens: ['नैसर्गिक', 'भाषा', 'प्रक्रिया', 'भाषा', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'कृत्रिम', 'बुद्धिमत्ता', 'उपक्षेत्र', ',', 'जो', 'संगणक', 'मानवी', 'भाषेतील', 'संवादाशी', 'संबंधित', '.']


# Indic -Gujarati




In [None]:
# Install necessary libraries
!pip install indic-nlp-library

# Import libraries
from indicnlp.tokenize import indic_tokenize

# Define stopwords for Gujarati directly in the code
gujarati_stopwords = [
    "છે", "છેવું", "અન્ય", "પણ", "આ", "કે", "અને", "એ", "તે", "જો", "માત્ર",
    "બધી", "બહુ", "તેમ", "જ", "થઈ", "શકે", "ને", "તેના", "તેમણે", "શું", "નથી",
    "હું", "અમે", "તમે", "તે"
]

# Define the Gujarati text
gujarati_text = "પ્રાકૃતિક ભાષા પ્રક્રિયા ભાષા વિજ્ઞાન, કમ્પ્યુટર વિજ્ઞાન અને કૃત્રિમ બુદ્ધિમત્તાનો એક ઉપક્ષેત્ર છે, જે કમ્પ્યુટર અને માનવી ભાષા વચ્ચેની ક્રિયાઓ સાથે સંબંધિત છે."

# Tokenization using Indic NLP for Gujarati
gujarati_tokens = indic_tokenize.trivial_tokenize(gujarati_text, lang='gu')
print("Gujarati Tokens:", gujarati_tokens)

# Filter out Gujarati stopwords
filtered_gujarati_tokens = [token for token in gujarati_tokens if token not in gujarati_stopwords]
print("Filtered Gujarati Tokens:", filtered_gujarati_tokens)


Gujarati Tokens: ['પ્રાકૃતિક', 'ભાષા', 'પ્રક્રિયા', 'ભાષા', 'વિજ્ઞાન', ',', 'કમ્પ્યુટર', 'વિજ્ઞાન', 'અને', 'કૃત્રિમ', 'બુદ્ધિમત્તાનો', 'એક', 'ઉપક્ષેત્ર', 'છે', ',', 'જે', 'કમ્પ્યુટર', 'અને', 'માનવી', 'ભાષા', 'વચ્ચેની', 'ક્રિયાઓ', 'સાથે', 'સંબંધિત', 'છે', '.']
Filtered Gujarati Tokens: ['પ્રાકૃતિક', 'ભાષા', 'પ્રક્રિયા', 'ભાષા', 'વિજ્ઞાન', ',', 'કમ્પ્યુટર', 'વિજ્ઞાન', 'કૃત્રિમ', 'બુદ્ધિમત્તાનો', 'એક', 'ઉપક્ષેત્ર', ',', 'જે', 'કમ્પ્યુટર', 'માનવી', 'ભાષા', 'વચ્ચેની', 'ક્રિયાઓ', 'સાથે', 'સંબંધિત', '.']


# **To perform Morphological Analysis**

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize , sent_tokenize
import re
import numpy as np
import pandas as pd
text = "Barack Obama was the 44th President of the United States."
def cln(text):
    text = text.lower()
    text = re.sub(r"[^a-z]"," ",text)
    text = word_tokenize(text)
    for i in text:
        if i in stopwords.words("english"):
            text.remove(i)
    return text
tok_text = cln(text)
print(tok_text)
def ste(tok_text):
  z= []
  ps = PorterStemmer()
  for i in tok_text:
    stem = ps.stem(i)
    z.append(stem)
  return z
stemmed_text = ste(tok_text)
print(stemmed_text)

def lem(tok_text):
  z= []
  Le = WordNetLemmatizer()
  for i in tok_text:
    lemma = Le.lemmatize(i)
    z.append(lemma)
  return z
lemmatized_text = lem(tok_text)
print(lemmatized_text)


['barack', 'obama', 'the', 'th', 'president', 'the', 'united', 'states']
['barack', 'obama', 'the', 'th', 'presid', 'the', 'unit', 'state']
['barack', 'obama', 'the', 'th', 'president', 'the', 'united', 'state']


# Morphological Analysis-Hindi

In [None]:
# Install necessary libraries
!pip install indic-nlp-library stanza
# Import necessary libraries
from indicnlp.tokenize import indic_tokenize
import stanza
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download Stanza model for Hindi
stanza.download('hi')

# Define Hindi stopwords directly
hindi_stopwords = [
    "है", "हैं", "हो", "था", "थी", "थे", "और", "पर", "के", "को", "से", "केलिए", "तक",
    "का", "की", "के", "हूँ", "हमें", "तुम", "यह", "वह", "जो", "किस", "कौन", "कैसे",
    "मैं", "हम", "तुम्हें", "यहां", "वहां", "नहीं", "क्या", "कब", "क्यों", "क्योंकि"
]

# Sample Hindi text
hindi_text = "प्राकृतिक भाषा प्रसंस्करण भाषा विज्ञान, कंप्यूटर विज्ञान और कृत्रिम बुद्धिमत्ता का एक उपक्षेत्र है।"

# Tokenization
hindi_tokens = indic_tokenize.trivial_tokenize(hindi_text, lang='hi')
print("Hindi Tokens:", hindi_tokens)

# Define stemming function
def stem_tokens(tok_text):
    ps = PorterStemmer()
    return [ps.stem(word) for word in tok_text]

# Define lemmatization function using Stanza
def lemmatize_tokens(tok_text, lang='hi'):
    nlp = stanza.Pipeline(lang)
    doc = nlp(" ".join(tok_text))
    return [word.lemma for sent in doc.sentences for word in sent.words]

# Stemming
stemmed_hindi = stem_tokens(hindi_tokens)
print("Stemmed Hindi Tokens:", stemmed_hindi)

# Lemmatization
lemmatized_hindi = lemmatize_tokens(hindi_tokens, lang='hi')
print("Lemmatized Hindi Tokens:", lemmatized_hindi)

# Stopword removal
filtered_hindi_tokens = [token for token in hindi_tokens if token not in hindi_stopwords]
print("Filtered Hindi Tokens:", filtered_hindi_tokens)




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Hindi Tokens: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'और', 'कृत्रिम', 'बुद्धिमत्ता', 'का', 'एक', 'उपक्षेत्र', 'है', '।']
Stemmed Hindi Tokens: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'और', 'कृत्रिम', 'बुद्धिमत्ता', 'का', 'एक', 'उपक्षेत्र', 'है', '।']


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: depparse
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


Lemmatized Hindi Tokens: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'और', 'कृत्रिम', 'बुद्धिमत्ता', 'का', 'एक', 'उपक्षेत्र', 'है', '।']
Filtered Hindi Tokens: ['प्राकृतिक', 'भाषा', 'प्रसंस्करण', 'भाषा', 'विज्ञान', ',', 'कंप्यूटर', 'विज्ञान', 'कृत्रिम', 'बुद्धिमत्ता', 'एक', 'उपक्षेत्र', '।']


# Morphological Analysis-Marathi

In [None]:
# Define Marathi stopwords directly
marathi_stopwords = [
    "आहे", "आहेत", "होता", "होती", "होते", "आणि", "वर", "चे", "ला", "च्या", "ते", "कसे",
    "मी", "आम्ही", "तुम्ही", "हे", "तो", "ती", "हे", "काय", "कधी", "का", "नाही", "होय"
]

# Sample Marathi text
marathi_text = "प्राकृतिक भाषा प्रक्रिया भाषा विज्ञान, संगणक विज्ञान आणि कृत्रिम बुद्धिमत्ता हे एक उपक्षेत्र आहे।"

# Tokenization
marathi_tokens = indic_tokenize.trivial_tokenize(marathi_text, lang='mr')
print("Marathi Tokens:", marathi_tokens)

# Download Stanza model for Marathi
stanza.download('mr')

# Define stemming function
def stem_tokens(tok_text):
    ps = PorterStemmer()
    return [ps.stem(word) for word in tok_text]

# Define lemmatization function using Stanza
def lemmatize_tokens(tok_text, lang='mr'):
    nlp = stanza.Pipeline(lang)
    doc = nlp(" ".join(tok_text))
    return [word.lemma for sent in doc.sentences for word in sent.words]

# Stemming
stemmed_marathi = stem_tokens(marathi_tokens)
print("Stemmed Marathi Tokens:", stemmed_marathi)

# Lemmatization
lemmatized_marathi = lemmatize_tokens(marathi_tokens, lang='mr')
print("Lemmatized Marathi Tokens:", lemmatized_marathi)

# Stopword removal
filtered_marathi_tokens = [token for token in marathi_tokens if token not in marathi_stopwords]
print("Filtered Marathi Tokens:", filtered_marathi_tokens)


Marathi Tokens: ['प्राकृतिक', 'भाषा', 'प्रक्रिया', 'भाषा', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'आणि', 'कृत्रिम', 'बुद्धिमत्ता', 'हे', 'एक', 'उपक्षेत्र', 'आहे', '।']


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: mr (Marathi) ...
INFO:stanza:File exists: /root/stanza_resources/mr/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Stemmed Marathi Tokens: ['प्राकृतिक', 'भाषा', 'प्रक्रिया', 'भाषा', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'आणि', 'कृत्रिम', 'बुद्धिमत्ता', 'हे', 'एक', 'उपक्षेत्र', 'आहे', '।']


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: mr (Marathi):
| Processor | Package       |
-----------------------------
| tokenize  | ufal          |
| mwt       | ufal          |
| pos       | ufal_charlm   |
| lemma     | ufal_nocharlm |
| depparse  | ufal_charlm   |
| sentiment | l3cube_charlm |
| ner       | l3cube        |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


Lemmatized Marathi Tokens: ['प्राकृतिक', 'भाष', 'प्रक्र', 'भाष', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'आणि', 'कृत्रिम', 'बुद्धिमत्त', 'हा', 'एक', 'उपक्षेत्र', 'असणे', '।']
Filtered Marathi Tokens: ['प्राकृतिक', 'भाषा', 'प्रक्रिया', 'भाषा', 'विज्ञान', ',', 'संगणक', 'विज्ञान', 'कृत्रिम', 'बुद्धिमत्ता', 'एक', 'उपक्षेत्र', '।']


# Morphological Analysis-Gujarati

In [None]:
# Define Gujarati stopwords directly
gujarati_stopwords = [
    "છે", "છેવું", "અન્ય", "પણ", "આ", "કે", "અને", "એ", "તે", "જો", "માત્ર",
    "બધી", "બહુ", "તેમ", "જ", "થઈ", "શકે", "ને", "તેના", "તેમણે", "શું", "નથી",
    "હું", "અમે", "તમે", "તે"
]

# Sample Gujarati text
gujarati_text = "પ્રાકૃતિક ભાષા પ્રક્રિયા ભાષા વિજ્ઞાન, કમ્પ્યુટર વિજ્ઞાન અને કૃત્રિમ બુદ્ધિમત્તાનો એક ઉપક્ષેત્ર છે।"

# Tokenization
gujarati_tokens = indic_tokenize.trivial_tokenize(gujarati_text, lang='gu')
print("Gujarati Tokens:", gujarati_tokens)

# Stemming (optional, but may not be as effective in Gujarati due to language differences)
def stem_tokens(tok_text):
    ps = PorterStemmer()
    return [ps.stem(word) for word in tok_text]

# Stemming
stemmed_gujarati = stem_tokens(gujarati_tokens)
print("Stemmed Gujarati Tokens:", stemmed_gujarati)

# Stopword removal
filtered_gujarati_tokens = [token for token in gujarati_tokens if token not in gujarati_stopwords]
print("Filtered Gujarati Tokens:", filtered_gujarati_tokens)


Gujarati Tokens: ['પ્રાકૃતિક', 'ભાષા', 'પ્રક્રિયા', 'ભાષા', 'વિજ્ઞાન', ',', 'કમ્પ્યુટર', 'વિજ્ઞાન', 'અને', 'કૃત્રિમ', 'બુદ્ધિમત્તાનો', 'એક', 'ઉપક્ષેત્ર', 'છે', '।']
Stemmed Gujarati Tokens: ['પ્રાકૃતિક', 'ભાષા', 'પ્રક્રિયા', 'ભાષા', 'વિજ્ઞાન', ',', 'કમ્પ્યુટર', 'વિજ્ઞાન', 'અને', 'કૃત્રિમ', 'બુદ્ધિમત્તાનો', 'એક', 'ઉપક્ષેત્ર', 'છે', '।']
Filtered Gujarati Tokens: ['પ્રાકૃતિક', 'ભાષા', 'પ્રક્રિયા', 'ભાષા', 'વિજ્ઞાન', ',', 'કમ્પ્યુટર', 'વિજ્ઞાન', 'કૃત્રિમ', 'બુદ્ધિમત્તાનો', 'એક', 'ઉપક્ષેત્ર', '।']


# **Ngram Model**

# English NGram

In [None]:
# Install necessary libraries
!pip install nltk

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Download NLTK data
nltk.download('punkt')

def create_ngram_model(sentences, n):
    """
    Create an n-gram model from a list of sentences.

    Args:
        sentences (list of str): The sentences to use for the n-gram model.
        n (int): The value of n for the n-gram model.

    Returns:
        ngram_model (dict): The n-gram model as a dictionary.
    """
    ngram_model = defaultdict(lambda: defaultdict(lambda: 0))

    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        for ngram in ngrams(tokens, n):
            prefix, suffix = tuple(ngram[:-1]), ngram[-1]
            ngram_model[prefix][suffix] += 1

    # Convert counts to probabilities
    for prefix in ngram_model:
        total_count = float(sum(ngram_model[prefix].values()))
        for suffix in ngram_model[prefix]:
            ngram_model[prefix][suffix] /= total_count

    return ngram_model

def predict_next_word(ngram_model, sentence, n):
    """
    Predict the next word given a sentence based on the n-gram model.

    Args:
        ngram_model (dict): The n-gram model.
        sentence (str): The sentence for which to predict the next word.
        n (int): The value of n for the n-gram model.

    Returns:
        next_word (str): The most probable next word.
    """
    tokens = word_tokenize(sentence.lower())

    if len(tokens) < n - 1:
        return "Not enough context to predict."

    prefix = tuple(tokens[-(n - 1):])

    if prefix not in ngram_model:
        return "No prediction available for this context."

    next_word = max(ngram_model[prefix], key=ngram_model[prefix].get)
    return next_word

def calculate_probability(ngram_model, sentence, n):
    """
    Calculate the probability of a given sentence based on the n-gram model.

    Args:
        ngram_model (dict): The n-gram model.
        sentence (str): The sentence for which to calculate the probability.
        n (int): The value of n for the n-gram model.

    Returns:
        probability (float): The probability of the sentence.
    """
    tokens = word_tokenize(sentence.lower())
    probability = 1.0

    if len(tokens) < n:
        return 0.0

    for i in range(n - 1, len(tokens)):
        prefix = tuple(tokens[i - (n - 1):i])
        suffix = tokens[i]

        if prefix in ngram_model and suffix in ngram_model[prefix]:
            probability *= ngram_model[prefix][suffix]
        else:
            return 0.0

    return probability

def main():
    # Get user input
    n = int(input("Enter the value of n for the n-gram model: "))
    sentences = input("Enter the sentences for the corpus (separated by ';'): ").split(';')
    test_sentence = input("Enter the sentence to predict the next word: ")

    # Create n-gram model
    ngram_model = create_ngram_model(sentences, n)

    # Calculate probability of a test sentence
    probability = calculate_probability(ngram_model, test_sentence, n)
    print(f"Probability of '{test_sentence}': {probability}")

    # Predict the next word
    next_word = predict_next_word(ngram_model, test_sentence, n)
    print(f"Predicted next word for '{test_sentence}': {next_word}")

if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the value of n for the n-gram model: 2
Enter the sentences for the corpus (separated by ';'): I love machine learning; Machine learning is fascinating; Natural language processing is a subfield of AI; AI is transforming industries.
Enter the sentence to predict the next word: I
Probability of 'I': 0.0
Predicted next word for 'I': love


## NGram-Hindi, Marathi and Gujarati

In [None]:
# Install necessary libraries
!pip install nltk

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Download NLTK data (you may need to run this only once)
nltk.download('punkt')

def create_ngram_model(sentences, n):
    """
    Create an n-gram model from a list of sentences.

    Args:
        sentences (list of str): The sentences to use for the n-gram model.
        n (int): The value of n for the n-gram model.

    Returns:
        ngram_model (dict): The n-gram model as a dictionary.
    """
    ngram_model = defaultdict(lambda: defaultdict(lambda: 0))

    for sentence in sentences:
        tokens = word_tokenize(sentence)  # No need to lower case for Hindi/Marathi/Gujarati
        for ngram in ngrams(tokens, n):
            prefix, suffix = tuple(ngram[:-1]), ngram[-1]
            ngram_model[prefix][suffix] += 1

    # Convert counts to probabilities
    for prefix in ngram_model:
        total_count = float(sum(ngram_model[prefix].values()))
        for suffix in ngram_model[prefix]:
            ngram_model[prefix][suffix] /= total_count

    return ngram_model

def predict_next_word(ngram_model, sentence, n):
    """
    Predict the next word given a sentence based on the n-gram model.

    Args:
        ngram_model (dict): The n-gram model.
        sentence (str): The sentence for which to predict the next word.
        n (int): The value of n for the n-gram model.

    Returns:
        next_word (str): The most probable next word.
    """
    tokens = word_tokenize(sentence)

    if len(tokens) < n - 1:
        return "Not enough context to predict."

    prefix = tuple(tokens[-(n - 1):])

    if prefix not in ngram_model:
        return "No prediction available for this context."

    next_word = max(ngram_model[prefix], key=ngram_model[prefix].get)
    return next_word

def calculate_probability(ngram_model, sentence, n):
    """
    Calculate the probability of a given sentence based on the n-gram model.

    Args:
        ngram_model (dict): The n-gram model.
        sentence (str): The sentence for which to calculate the probability.
        n (int): The value of n for the n-gram model.

    Returns:
        probability (float): The probability of the sentence.
    """
    tokens = word_tokenize(sentence)
    probability = 1.0

    if len(tokens) < n:
        return 0.0

    for i in range(n - 1, len(tokens)):
        prefix = tuple(tokens[i - (n - 1):i])
        suffix = tokens[i]

        if prefix in ngram_model and suffix in ngram_model[prefix]:
            probability *= ngram_model[prefix][suffix]
        else:
            return 0.0

    return probability

def main():
    # Get user input
    n = int(input("Enter the value of n for the n-gram model: "))
    language = input("Enter the language (Hindi, Gujarati, Marathi): ").lower()

    if language == "hindi":
        sentences = input("Enter the Hindi sentences for the corpus (separated by ';'): ").split(';')
    elif language == "gujarati":
        sentences = input("Enter the Gujarati sentences for the corpus (separated by ';'): ").split(';')
    elif language == "marathi":
        sentences = input("Enter the Marathi sentences for the corpus (separated by ';'): ").split(';')
    else:
        print("Unsupported language.")
        return

    test_sentence = input("Enter the sentence to predict the next word: ")

    # Create n-gram model
    ngram_model = create_ngram_model(sentences, n)

    # Calculate probability of a test sentence
    probability = calculate_probability(ngram_model, test_sentence, n)
    print(f"Probability of '{test_sentence}': {probability}")

    # Predict the next word
    next_word = predict_next_word(ngram_model, test_sentence, n)
    print(f"Predicted next word for '{test_sentence}': {next_word}")

if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the value of n for the n-gram model: 2
Enter the language (Hindi, Gujarati, Marathi): hindi
Enter the Hindi sentences for the corpus (separated by ';'): ुझे मशीन लर्निंग पसंद है; मशीन लर्निंग अद्भुत है; प्राकृतिक भाषा प्रोसेसिंग एक उपक्षेत्र है।
Enter the sentence to predict the next word: मशीन
Probability of 'मशीन': 0.0
Predicted next word for 'मशीन': लर्निंग


In [None]:
#hindi corpus
#मुझे मशीन लर्निंग पसंद है; मशीन लर्निंग अद्भुत है; प्राकृतिक भाषा प्रोसेसिंग एक उपक्षेत्र है।

#marathi corpus
#मला मशीन शिकणे आवडते; मशीन शिकणे आकर्षक आहे; नैसर्गिक भाषा प्रक्रिया एक उपक्षेत्र आहे।

#gujju corpus
#મને મશીન લર્નિંગ પસંદ છે; મશીન લર્નિંગ રોમાંચક છે; નૈતિક ભાષા પ્રક્રિયા એક ઉપવિષય છે।


# **POS Tagging**

## Rule Based- Hindi

In [None]:
# Hindi Stop Words Removal
hindi_stop_words = set([
    'और', 'म', 'हैं', 'वह', 'हम', 'यह', 'के', 'को', 'से', 'भी', 'पर', 'का', 'ने', 'आ', 'कर',
    'कि', 'हूं', 'सकते', 'सकती', 'होगा'
])

def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in hindi_stop_words]
    return ' '.join(filtered_words)

# Example usage for stop word removal
text = "मैं स्कूल जा रहा हूँ और वह भी आ रहे हैं।"
clean_text = remove_stop_words(text)
print("Clean Text:", clean_text)  # Output: "मैं स्कूल जा रहा हूँ आ रहे।"

# Hindi Stemming
def stem_word(word):
    # Basic example for Hindi stemming
    if word.endswith('ना'):
        return word[:-2]
    elif word.endswith('ी'):
        return word[:-1]
    elif word.endswith('ों'):
        return word[:-2]
    return word

def stem_text(text):
    words = text.split()
    stemmed_words = [stem_word(word) for word in words]
    return ' '.join(stemmed_words)

# Example usage for stemming
text = "करना करता करती कि या कर"
stemmed_text = stem_text(text)
print("Stemmed Text:", stemmed_text)  # Output: "कर करत करती कि या कर"

# Hindi Lemmatization
hindi_lemma_dict = {
    'करना': 'कर',
    'करते': 'कर',
    'किया': 'कर',
    'चलना': 'चल',
    'चला': 'चल'
}

def lemmatize_word(word):
    return hindi_lemma_dict.get(word, word)

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatize_word(word) for word in words]
    return ' '.join(lemmatized_words)

# Example usage for lemmatization
text = "करना करते किया चला"
lemmatized_text = lemmatize_text(text)
print("Lemmatized Text:", lemmatized_text)  # Output: "कर कर कर चल"

# Simple Rule Based POS Tagging
pos_tags = {
    'मैं': 'सर्वनाम',
    'स्कूल': 'संज्ञा',
    'जा': 'क्रिया',
    'रहा': 'क्रिया',
    'हूँ': 'क्रिया',
}

def pos_tagging(text):
    words = text.split()
    tagged_words = [(word, pos_tags.get(word, 'अन्य')) for word in words]
    return tagged_words

# Example usage for POS tagging
text = "मैं स्कूल जा रहा हूँ"
tagged_text = pos_tagging(text)
print("Tagged Text:", tagged_text)  # Output: [('मैं', 'सर्वनाम'), ('स्कूल', 'संज्ञा'), ('जा', 'क्रिया'), ('रहा', 'क्रिया'), ('हूँ', 'क्रिया')]

# Bigger Dictionary Example - Rule Based POS Tagging
pos_tags = {
    'जंगल': 'संज्ञा',
    'जीवन': 'संज्ञा',
    'का': 'सर्वनाम',
    'स्रोत': 'संज्ञा',
    'और': 'संयोजक',
    'प्रकृति': 'विशेषण',
    'धरोहर': 'संज्ञा',
    'हमारे': 'सर्वनाम',
    'है': 'क्रिया',
}

def pos_tag_sentence(sentence):
    words = sentence.split()
    tagged_words = [(word, pos_tags.get(word, 'अन्य')) for word in words]
    return tagged_words

# Test sentences for bigger dictionary
example_sentence = "जंगल जीवन का स्रोत है"
tagged_example_sentence = pos_tag_sentence(example_sentence)
print("Tagged Example Sentence:", tagged_example_sentence)

test_sentence = "प्राकृतिक जीवन जंगल में है"
tagged_test_sentence = pos_tag_sentence(test_sentence)
print("Tagged Test Sentence:", tagged_test_sentence)


Clean Text: मैं स्कूल जा रहा हूँ रहे हैं।
Stemmed Text: कर करता करत कि या कर
Lemmatized Text: कर कर कर चल
Tagged Text: [('मैं', 'सर्वनाम'), ('स्कूल', 'संज्ञा'), ('जा', 'क्रिया'), ('रहा', 'क्रिया'), ('हूँ', 'क्रिया')]
Tagged Example Sentence: [('जंगल', 'संज्ञा'), ('जीवन', 'संज्ञा'), ('का', 'सर्वनाम'), ('स्रोत', 'संज्ञा'), ('है', 'क्रिया')]
Tagged Test Sentence: [('प्राकृतिक', 'अन्य'), ('जीवन', 'संज्ञा'), ('जंगल', 'संज्ञा'), ('में', 'अन्य'), ('है', 'क्रिया')]


# Rule Based-Gujarati

In [None]:
# Gujarati Stop Words Removal
gujarati_stop_words = set([
    'અને', 'હું', 'છું', 'તે', 'તેને', 'તેના', 'મારા', 'મને', 'કરવા', 'જવા', 'જીતે',
    'કે', 'તો', 'પર', 'મુજબ'
])

def remove_gujarati_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in gujarati_stop_words]
    return ' '.join(filtered_words)

# Example usage for Gujarati stop word removal
gujarati_text = "હું આજે સ્કૂલ જવા જઈ રહ્યો છું અને તે પણ આવે છે."
clean_gujarati_text = remove_gujarati_stop_words(gujarati_text)
print("Clean Gujarati Text:", clean_gujarati_text)  # Output: "આજે સ્કૂલ જવા જઈ રહ્યો અને પણ આવે છે."

# Gujarati Stemming
def stem_gujarati_word(word):
    # Basic example for Gujarati stemming
    if word.endswith('વા'):
        return word[:-2]
    elif word.endswith('ી'):
        return word[:-1]
    elif word.endswith('ઓ'):
        return word[:-2]
    return word

def stem_gujarati_text(text):
    words = text.split()
    stemmed_words = [stem_gujarati_word(word) for word in words]
    return ' '.join(stemmed_words)

# Example usage for Gujarati stemming
gujarati_stem_text = "કરવા કરે છે"
stemmed_gujarati_text = stem_gujarati_text(gujarati_stem_text)
print("Stemmed Gujarati Text:", stemmed_gujarati_text)  # Output: "કર કરે છે"

# Gujarati Lemmatization
gujarati_lemma_dict = {
    'કરવા': 'કર',
    'કરતા': 'કર',
    'કર્યું': 'કર',
    'જવા': 'જવ',
    'ગઈ': 'જવ'
}

def lemmatize_gujarati_word(word):
    return gujarati_lemma_dict.get(word, word)

def lemmatize_gujarati_text(text):
    words = text.split()
    lemmatized_words = [lemmatize_gujarati_word(word) for word in words]
    return ' '.join(lemmatized_words)

# Example usage for Gujarati lemmatization
gujarati_lemma_text = "કરવા કરતા કર્યું જવું"
lemmatized_gujarati_text = lemmatize_gujarati_text(gujarati_lemma_text)
print("Lemmatized Gujarati Text:", lemmatized_gujarati_text)  # Output: "કર કર કર જવું"

# Gujarati POS Tagging
gujarati_pos_tags = {
    'હું': 'સર્વનામ',
    'સ્કૂલ': 'સંજ્ઞા',
    'જવા': 'ક્રિયા',
    'હું': 'ક્રિયા',
}

def pos_tag_gujarati(text):
    words = text.split()
    tagged_words = [(word, gujarati_pos_tags.get(word, 'અન્ય')) for word in words]
    return tagged_words

# Example usage for Gujarati POS tagging
gujarati_pos_text = "હું સ્કૂલ જવા જઈ રહ્યો છું"
tagged_gujarati_text = pos_tag_gujarati(gujarati_pos_text)
print("Tagged Gujarati Text:", tagged_gujarati_text)  # Output: [('હું', 'સર્વનામ'), ('સ્કૂલ', 'સંજ્ઞા'), ('જવા', 'ક્રિયા'), ('જઈ', 'અન્ય'), ('રહ્યો', 'અન્ય'), ('છું', 'ક્રિયા')]


Clean Gujarati Text: આજે સ્કૂલ જઈ રહ્યો પણ આવે છે.
Stemmed Gujarati Text: કર કરે છે
Lemmatized Gujarati Text: કર કર કર જવું
Tagged Gujarati Text: [('હું', 'ક્રિયા'), ('સ્કૂલ', 'સંજ્ઞા'), ('જવા', 'ક્રિયા'), ('જઈ', 'અન્ય'), ('રહ્યો', 'અન્ય'), ('છું', 'અન્ય')]


# Rule Based Tagger- Marathi

In [None]:
# Marathi Stop Words Removal
marathi_stop_words = set([
    'आहे', 'मी', 'तो', 'ती', 'आपण', 'हे', 'त्याला', 'कडे', 'साठी', 'परंतु', 'आणि',
    'ते', 'असते', 'किंवा'
])

def remove_marathi_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in marathi_stop_words]
    return ' '.join(filtered_words)

# Example usage for Marathi stop word removal
marathi_text = "मी शाळेत जात आहे आणि तो पण येतो."
clean_marathi_text = remove_marathi_stop_words(marathi_text)
print("Clean Marathi Text:", clean_marathi_text)  # Output: "मी शाळेत जात आणि पण येतो."

# Marathi Stemming
def stem_marathi_word(word):
    # Basic example for Marathi stemming
    if word.endswith('णे'):
        return word[:-2]
    elif word.endswith('ी'):
        return word[:-1]
    elif word.endswith('आहे'):
        return word[:-4]
    return word

def stem_marathi_text(text):
    words = text.split()
    stemmed_words = [stem_marathi_word(word) for word in words]
    return ' '.join(stemmed_words)

# Example usage for Marathi stemming
marathi_stem_text = "करण्यात आले"
stemmed_marathi_text = stem_marathi_text(marathi_stem_text)
print("Stemmed Marathi Text:", stemmed_marathi_text)  # Output: "करण्यात आले"

# Marathi Lemmatization
marathi_lemma_dict = {
    'करणे': 'कर',
    'करले': 'कर',
    'सोडणे': 'सोड',
    'जाणे': 'जा',
    'गेला': 'जा'
}

def lemmatize_marathi_word(word):
    return marathi_lemma_dict.get(word, word)

def lemmatize_marathi_text(text):
    words = text.split()
    lemmatized_words = [lemmatize_marathi_word(word) for word in words]
    return ' '.join(lemmatized_words)

# Example usage for Marathi lemmatization
marathi_lemma_text = "करणे करतो केले जात आहे"
lemmatized_marathi_text = lemmatize_marathi_text(marathi_lemma_text)
print("Lemmatized Marathi Text:", lemmatized_marathi_text)  # Output: "कर करतो केले जात आहे"

# Marathi POS Tagging
marathi_pos_tags = {
    'मी': 'सर्वनाम',
    'शाळा': 'संज्ञा',
    'जात': 'क्रिया',
    'आहे': 'क्रिया',
}

def pos_tag_marathi(text):
    words = text.split()
    tagged_words = [(word, marathi_pos_tags.get(word, 'अन्य')) for word in words]
    return tagged_words

# Example usage for Marathi POS tagging
marathi_pos_text = "मी शाळेत जात आहे"
tagged_marathi_text = pos_tag_marathi(marathi_pos_text)
print("Tagged Marathi Text:", tagged_marathi_text)  # Output: [('मी', 'सर्वनाम'), ('शाळेत', 'अन्य'), ('जात', 'क्रिया'), ('आहे', 'क्रिया')]


Clean Marathi Text: शाळेत जात पण येतो.
Stemmed Marathi Text: करण्यात आले
Lemmatized Marathi Text: कर करतो केले जात आहे
Tagged Marathi Text: [('मी', 'सर्वनाम'), ('शाळेत', 'अन्य'), ('जात', 'क्रिया'), ('आहे', 'क्रिया')]


# Stochastic Tagger- Hindi

In [None]:
from collections import defaultdict
import numpy as np

# Annotated corpus with multiple possible tags for each word
corpus = [
    ("गणेश चतुर्थी एक महत्वपूर्ण त्योहार है", ["संज्ञा", "संज्ञा", "सर्वनाम", "विशेषण", "संज्ञा", "क्रिया"]),
    ("गणपति बप्पा मोरया की पूजा धूमधाम से होती है", ["संज्ञा", "संज्ञा", "संज्ञा", "संबंधवाचक", "संज्ञा", "विशेषण", "संबंधवाचक", "क्रिया", "है"]),
    ("इस अवसर पर घरों में गणपति की प्रतिमा स्थापित की जाती है", ["सर्वनाम", "संज्ञा", "संबंधवाचक", "संज्ञा", "संबंधवाचक", "संज्ञा", "संबंधवाचक", "संज्ञा", "क्रिया", "संबंधवाचक", "क्रिया", "है"]),
    ("लोग रंग-बिरंगे कपड़े पहनकर गणेश जी की आराधना करते हैं", ["संज्ञा", "विशेषण", "संज्ञा", "कृया", "संज्ञा", "संबंधवाचक", "संज्ञा", "क्रिया", "हैं"]),
    ("गणेश विसर्जन के दिन उत्सव का समापन होता है", ["संज्ञा", "संज्ञा", "संबंधवाचक", "संज्ञा", "संज्ञा", "संबंधवाचक", "संज्ञा", "क्रिया", "है"])
]

# Initialize counts
tag_counts = defaultdict(int)
word_tag_counts = defaultdict(lambda: defaultdict(int))
transition_counts = defaultdict(lambda: defaultdict(int))

# Count occurrences
for sentence, tags in corpus:
    words = sentence.split()
    for word, tag in zip(words, tags):
        tag_counts[tag] += 1
        word_tag_counts[word][tag] += 1
    for i in range(len(tags) - 1):
        transition_counts[tags[i]][tags[i + 1]] += 1

# Calculate emission probabilities
emission_prob = defaultdict(lambda: defaultdict(float))
for word, tags in word_tag_counts.items():
    for tag, count in tags.items():
        emission_prob[word][tag] = count / tag_counts[tag]

# Calculate transition probabilities
transition_prob = defaultdict(lambda: defaultdict(float))
for tag1, next_tags in transition_counts.items():
    total = sum(next_tags.values())
    for tag2, count in next_tags.items():
        transition_prob[tag1][tag2] = count / total

# Handling unseen words and tags
def get_emission_prob(word, tag):
    return emission_prob[word].get(tag, 1e-6)  # Smoothing for unseen words

def get_transition_prob(tag1, tag2):
    return transition_prob[tag1].get(tag2, 1e-6)  # Smoothing for unseen transitions

def print_emission_matrix():
    print("Emission Matrix:")
    words = list(word_tag_counts.keys())
    tags = list(tag_counts.keys())
    print("Word/Tag", "\t".join(tags))
    for word in words:
        row = [f"{emission_prob[word].get(tag, 0):.4f}" for tag in tags]
        print(f"{word}\t" + "\t".join(row))

def print_transition_matrix():
    print("\nTransition Matrix:")
    tags = list(tag_counts.keys())
    print("From/To", "\t".join(tags))
    for tag1 in tags:
        row = [f"{transition_prob[tag1].get(tag2, 0):.4f}" for tag2 in tags]
        print(f"{tag1}\t" + "\t".join(row))

def viterbi(sentence):
    words = sentence.split()
    n = len(words)
    tags = list(tag_counts.keys())

    # Initialization
    V = defaultdict(lambda: defaultdict(float))
    backpointer = defaultdict(lambda: defaultdict(str))

    # Initial probabilities
    for tag in tags:
        V[0][tag] = np.log(get_emission_prob(words[0], tag))

    # Dynamic programming
    for t in range(1, n):
        for tag in tags:
            max_prob, best_tag = max(
                (V[t - 1][prev_tag] + np.log(get_transition_prob(prev_tag, tag)) + np.log(get_emission_prob(words[t], tag)), prev_tag)
                for prev_tag in tags
            )
            V[t][tag] = max_prob
            backpointer[t][tag] = best_tag

    # Backtrack to find the best path
    best_path_prob, best_last_tag = max((V[n - 1][tag], tag) for tag in tags)
    best_path = [best_last_tag]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointer[t][best_path[0]])

    return best_path, best_path_prob

# Print matrices
print_emission_matrix()
print_transition_matrix()

# Test with a new sentence
test_sentence = "गणेश चतुर्थी का त्योहार है"
predicted_tags, prob = viterbi(test_sentence)
print("\nTest Sentence:", test_sentence)
print("\nPredicted Tags:", predicted_tags)
print("Log Probability of the Best Path:", prob)


Emission Matrix:
Word/Tag संज्ञा	सर्वनाम	विशेषण	क्रिया	संबंधवाचक	है	कृया	हैं
गणेश	0.1500	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
चतुर्थी	0.0500	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
एक	0.0000	0.5000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
महत्वपूर्ण	0.0000	0.0000	0.3333	0.0000	0.0000	0.0000	0.0000	0.0000
त्योहार	0.0500	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
है	0.0000	0.0000	0.0000	0.1667	0.0000	1.0000	0.0000	0.0000
गणपति	0.1000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
बप्पा	0.0500	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
मोरया	0.0500	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
की	0.0500	0.0000	0.0000	0.0000	0.3333	0.0000	0.0000	0.0000
पूजा	0.0500	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
धूमधाम	0.0000	0.0000	0.3333	0.0000	0.0000	0.0000	0.0000	0.0000
से	0.0000	0.0000	0.0000	0.0000	0.1111	0.0000	0.0000	0.0000
होती	0.0000	0.0000	0.0000	0.1667	0.0000	0.0000	0.0000	0.0000
इस	0.0000	0.5000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000
अ

# Stochastic Tagger- Gujarati

In [None]:
from collections import defaultdict
import numpy as np

# Annotated corpus for Gujarati
gujarati_corpus = [
    ("ગણેશ ચતુર્થી એક મહત્વપૂર્ણ તહેવાર છે", ["સંજ્ઞા", "સંજ્ઞા", "સર્વનામ", "વિશેષણ", "સંજ્ઞા", "ક્રિયા"]),
    ("ગણપતિ બપ્પા મોરયા ની પૂજા ધૂમધામથી થાય છે", ["સંજ્ઞા", "સંજ્ઞા", "સંજ્ઞા", "સંબંધવાચક", "સંજ્ઞા", "વિશેષણ", "સંબંધવાચક", "ક્રિયા", "છે"]),
]

def build_gujarati_model(corpus):
    tag_counts = defaultdict(int)
    word_tag_counts = defaultdict(lambda: defaultdict(int))
    transition_counts = defaultdict(lambda: defaultdict(int))

    for sentence, tags in corpus:
        words = sentence.split()
        for word, tag in zip(words, tags):
            tag_counts[tag] += 1
            word_tag_counts[word][tag] += 1
        for i in range(len(tags) - 1):
            transition_counts[tags[i]][tags[i + 1]] += 1

    emission_prob = defaultdict(lambda: defaultdict(float))
    for word, tags in word_tag_counts.items():
        for tag, count in tags.items():
            emission_prob[word][tag] = count / tag_counts[tag]

    transition_prob = defaultdict(lambda: defaultdict(float))
    for tag1, next_tags in transition_counts.items():
        total = sum(next_tags.values())
        for tag2, count in next_tags.items():
            transition_prob[tag1][tag2] = count / total

    return emission_prob, transition_prob, tag_counts

def print_matrices(emission_prob, transition_prob, tag_counts):
    print("\nEmission Probability Matrix (Gujarati):")
    print("{:<20} {}".format("Word/Tag", " | ".join(tag_counts.keys())))
    for word in emission_prob:
        row = [f"{emission_prob[word].get(tag, 0):.3f}" for tag in tag_counts.keys()]
        print(f"{word:<20} {' | '.join(row)}")

    print("\nTransition Probability Matrix (Gujarati):")
    print("{:<20} {}".format("Tag1/Tag2", " | ".join(tag_counts.keys())))
    for tag1 in transition_prob:
        row = [f"{transition_prob[tag1].get(tag2, 0):.3f}" for tag2 in tag_counts.keys()]
        print(f"{tag1:<20} {' | '.join(row)}")

def get_gujarati_emission_prob(emission_prob, word, tag):
    return emission_prob[word].get(tag, 1e-6)

def get_gujarati_transition_prob(transition_prob, tag1, tag2):
    return transition_prob[tag1].get(tag2, 1e-6)

def viterbi_gujarati(emission_prob, transition_prob, tag_counts, sentence):
    words = sentence.split()
    n = len(words)
    tags = list(tag_counts.keys())

    V = defaultdict(lambda: defaultdict(float))
    backpointer = defaultdict(lambda: defaultdict(str))

    for tag in tags:
        V[0][tag] = np.log(get_gujarati_emission_prob(emission_prob, words[0], tag))

    for t in range(1, n):
        for tag in tags:
            max_prob, best_tag = max(
                (V[t - 1][prev_tag] + np.log(get_gujarati_transition_prob(transition_prob, prev_tag, tag)) + np.log(get_gujarati_emission_prob(emission_prob, words[t], tag)), prev_tag)
                for prev_tag in tags
            )
            V[t][tag] = max_prob
            backpointer[t][tag] = best_tag

    best_path_prob, best_last_tag = max((V[n - 1][tag], tag) for tag in tags)
    best_path = [best_last_tag]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointer[t][best_path[0]])

    return best_path, best_path_prob

# Build model for Gujarati
gujarati_emission_prob, gujarati_transition_prob, gujarati_tag_counts = build_gujarati_model(gujarati_corpus)

# Print matrices for Gujarati
print_matrices(gujarati_emission_prob, gujarati_transition_prob, gujarati_tag_counts)

# Test with a new sentence for Gujarati
gujarati_test_sentence = "ગણેશ ચતુર્થી નો તહેવાર છે"
gujarati_predicted_tags, gujarati_prob = viterbi_gujarati(gujarati_emission_prob, gujarati_transition_prob, gujarati_tag_counts, gujarati_test_sentence)
print("\nGujarati Test Sentence:", gujarati_test_sentence)
print("Predicted Tags:", gujarati_predicted_tags)
print("Log Probability of the Best Path:", gujarati_prob)



Emission Probability Matrix (Gujarati):
Word/Tag             સંજ્ઞા | સર્વનામ | વિશેષણ | ક્રિયા | સંબંધવાચક
ગણેશ                 0.143 | 0.000 | 0.000 | 0.000 | 0.000
ચતુર્થી              0.143 | 0.000 | 0.000 | 0.000 | 0.000
એક                   0.000 | 1.000 | 0.000 | 0.000 | 0.000
મહત્વપૂર્ણ           0.000 | 0.000 | 0.500 | 0.000 | 0.000
તહેવાર               0.143 | 0.000 | 0.000 | 0.000 | 0.000
છે                   0.000 | 0.000 | 0.000 | 1.000 | 0.000
ગણપતિ                0.143 | 0.000 | 0.000 | 0.000 | 0.000
બપ્પા                0.143 | 0.000 | 0.000 | 0.000 | 0.000
મોરયા                0.143 | 0.000 | 0.000 | 0.000 | 0.000
ની                   0.000 | 0.000 | 0.000 | 0.000 | 0.500
પૂજા                 0.143 | 0.000 | 0.000 | 0.000 | 0.000
ધૂમધામથી             0.000 | 0.000 | 0.500 | 0.000 | 0.000
થાય                  0.000 | 0.000 | 0.000 | 0.000 | 0.500

Transition Probability Matrix (Gujarati):
Tag1/Tag2            સંજ્ઞા | સર્વનામ | વિશેષણ | ક્રિયા | સંબંધવાચક
સંજ્ઞા       

# Stochastic Tagger- Marathi

In [None]:
from collections import defaultdict
import numpy as np

# Annotated corpus for Marathi
marathi_corpus = [
    ("गणेश चतुर्थी एक महत्वाचा सण आहे", ["संज्ञा", "संज्ञा", "सर्वनाम", "विशेषण", "संज्ञा", "क्रिया"]),
    ("गणपती बप्पा मोरया की पूजा धूमधाम से होते आहे", ["संज्ञा", "संज्ञा", "संज्ञा", "संबंधवाचक", "संज्ञा", "विशेषण", "संबंधवाचक", "क्रिया", "आहे"]),
]

def build_marathi_model(corpus):
    tag_counts = defaultdict(int)
    word_tag_counts = defaultdict(lambda: defaultdict(int))
    transition_counts = defaultdict(lambda: defaultdict(int))

    for sentence, tags in corpus:
        words = sentence.split()
        for word, tag in zip(words, tags):
            tag_counts[tag] += 1
            word_tag_counts[word][tag] += 1
        for i in range(len(tags) - 1):
            transition_counts[tags[i]][tags[i + 1]] += 1

    emission_prob = defaultdict(lambda: defaultdict(float))
    for word, tags in word_tag_counts.items():
        for tag, count in tags.items():
            emission_prob[word][tag] = count / tag_counts[tag]

    transition_prob = defaultdict(lambda: defaultdict(float))
    for tag1, next_tags in transition_counts.items():
        total = sum(next_tags.values())
        for tag2, count in next_tags.items():
            transition_prob[tag1][tag2] = count / total

    return emission_prob, transition_prob, tag_counts

def print_matrices(emission_prob, transition_prob, tag_counts):
    print("\nEmission Probability Matrix (Marathi):")
    print("{:<20} {}".format("Word/Tag", " | ".join(tag_counts.keys())))
    for word in emission_prob:
        row = [f"{emission_prob[word].get(tag, 0):.3f}" for tag in tag_counts.keys()]
        print(f"{word:<20} {' | '.join(row)}")

    print("\nTransition Probability Matrix (Marathi):")
    print("{:<20} {}".format("Tag1/Tag2", " | ".join(tag_counts.keys())))
    for tag1 in transition_prob:
        row = [f"{transition_prob[tag1].get(tag2, 0):.3f}" for tag2 in tag_counts.keys()]
        print(f"{tag1:<20} {' | '.join(row)}")

def get_marathi_emission_prob(emission_prob, word, tag):
    return emission_prob[word].get(tag, 1e-6)

def get_marathi_transition_prob(transition_prob, tag1, tag2):
    return transition_prob[tag1].get(tag2, 1e-6)

def viterbi_marathi(emission_prob, transition_prob, tag_counts, sentence):
    words = sentence.split()
    n = len(words)
    tags = list(tag_counts.keys())

    V = defaultdict(lambda: defaultdict(float))
    backpointer = defaultdict(lambda: defaultdict(str))

    for tag in tags:
        V[0][tag] = np.log(get_marathi_emission_prob(emission_prob, words[0], tag))

    for t in range(1, n):
        for tag in tags:
            max_prob, best_tag = max(
                (V[t - 1][prev_tag] + np.log(get_marathi_transition_prob(transition_prob, prev_tag, tag)) + np.log(get_marathi_emission_prob(emission_prob, words[t], tag)), prev_tag)
                for prev_tag in tags
            )
            V[t][tag] = max_prob
            backpointer[t][tag] = best_tag

    best_path_prob, best_last_tag = max((V[n - 1][tag], tag) for tag in tags)
    best_path = [best_last_tag]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointer[t][best_path[0]])

    return best_path, best_path_prob

# Build model for Marathi
marathi_emission_prob, marathi_transition_prob, marathi_tag_counts = build_marathi_model(marathi_corpus)

# Print matrices for Marathi
print_matrices(marathi_emission_prob, marathi_transition_prob, marathi_tag_counts)

# Test with a new sentence for Marathi
marathi_test_sentence = "गणेश चतुर्थीचा सण आहे"
marathi_predicted_tags, marathi_prob = viterbi_marathi(marathi_emission_prob, marathi_transition_prob, marathi_tag_counts, marathi_test_sentence)
print("\nMarathi Test Sentence:", marathi_test_sentence)
print("Predicted Tags:", marathi_predicted_tags)
print("Log Probability of the Best Path:", marathi_prob)



Emission Probability Matrix (Marathi):
Word/Tag             संज्ञा | सर्वनाम | विशेषण | क्रिया | संबंधवाचक | आहे
गणेश                 0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
चतुर्थी              0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
एक                   0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000
महत्वाचा             0.000 | 0.000 | 0.500 | 0.000 | 0.000 | 0.000
सण                   0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
आहे                  0.000 | 0.000 | 0.000 | 0.500 | 0.000 | 1.000
गणपती                0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
बप्पा                0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
मोरया                0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
की                   0.000 | 0.000 | 0.000 | 0.000 | 0.500 | 0.000
पूजा                 0.143 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000
धूमधाम               0.000 | 0.000 | 0.500 | 0.000 | 0.000 | 0.000
से                   0.000 | 0.000 | 0.000 | 0.000 | 0.500 | 0.000
होते           

# **Chunking**

# English

In [None]:
import nltk
from nltk.chunk import RegexpParser
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Ensure NLTK data is downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Generate example sentences
sentences = [
    "The Big Apple is a nickname for New York City.",  # Sentence with Big Apple and New York
    "The Empire State Building is in New York.",  # Sentence with New York
    "A red apple fell from the tall tree.",  # Simple noun phrase
    "She bought a lovely bouquet of flowers for the party.",  # Complex noun phrase
    "John and Mary went to the market to buy fresh vegetables.",  # Verb phrase with multiple nouns
    "The cat sat on the mat in the cozy living room.",  # Prepositional phrase
    "The fast train arrived early at the station.",  # Adjective-noun phrase
    "Children love playing in the sunny park during summer.",  # Noun phrase with prepositional phrase
    "The chef prepared a delicious meal for the guests.",  # Verb phrase with noun phrase
    "At the break of dawn, the city starts to wake up.",  # Complex sentence with multiple chunks
    "She went to the local bakery for some fresh bread.",  # Prepositional phrase and noun phrase
    "They enjoyed a quiet evening by the lake.",  # Noun phrase and prepositional phrase
    "The book on the shelf was covered in dust.",  # Noun phrase and prepositional phrase
    "It was a bright, sunny day in the countryside.",  # Adjective-noun phrase
    "The museum exhibits ancient artifacts from various cultures.",  # Complex noun phrase
    "Under the old oak tree, they found a hidden treasure.",  # Prepositional phrase and noun phrase
    "She wrote a heartfelt letter to her dear friend.",  # Verb phrase and noun phrase
    "The car parked near the restaurant was red.",  # Noun phrase with prepositional phrase
    "The children were playing joyfully in the playground.",  # Verb phrase and noun phrase
    "At the crack of dawn, the birds began to sing.",  # Prepositional phrase and verb phrase
]

# Define chunk pattern
chunk_pattern = '''
    NP: {<DT>?<JJ>*<NN.*>+}
    VP: {<VB.*><NP|PP>*}
    PP: {<IN><NP>}
    '''
print("chunk_pattern:",chunk_pattern)

# Initialize chunk parser
chunk_parser = RegexpParser(chunk_pattern)

# Process and chunk sentences
for sentence in sentences:
    # Tokenize and POS tagging
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)

    # Perform chunking
    chunks = chunk_parser.parse(tagged)

    # Print sentence and chunking result
    print(f"Sentence: {sentence}")
    print("Chunks:")
    print(chunks)
    print()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


chunk_pattern: 
    NP: {<DT>?<JJ>*<NN.*>+}
    VP: {<VB.*><NP|PP>*}
    PP: {<IN><NP>}
    
Sentence: The Big Apple is a nickname for New York City.
Chunks:
(S
  (NP The/DT Big/NNP Apple/NNP)
  (VP is/VBZ (NP a/DT nickname/NN))
  (PP for/IN (NP New/NNP York/NNP City/NNP))
  ./.)

Sentence: The Empire State Building is in New York.
Chunks:
(S
  (NP The/DT Empire/NNP State/NNP Building/NNP)
  (VP is/VBZ)
  (PP in/IN (NP New/NNP York/NNP))
  ./.)

Sentence: A red apple fell from the tall tree.
Chunks:
(S
  (NP A/DT red/JJ apple/NN)
  (VP fell/VBD)
  (PP from/IN (NP the/DT tall/JJ tree/NN))
  ./.)

Sentence: She bought a lovely bouquet of flowers for the party.
Chunks:
(S
  She/PRP
  (VP bought/VBD (NP a/DT lovely/JJ bouquet/NN))
  (PP of/IN (NP flowers/NNS))
  (PP for/IN (NP the/DT party/NN))
  ./.)

Sentence: John and Mary went to the market to buy fresh vegetables.
Chunks:
(S
  (NP John/NNP)
  and/CC
  (NP Mary/NNP)
  (VP went/VBD)
  to/TO
  (NP the/DT market/NN)
  to/TO
  (VP buy/VB (

# Hindi

In [None]:
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser

# Example sentences in Hindi
sentences = [
    "नयी दिल्ली भारत की राजधानी है।",  # New Delhi is the capital of India.
    "ताजमहल आगरा में स्थित है।",       # The Taj Mahal is located in Agra.
    "मैंने आज एक किताब पढ़ी।",         # I read a book today.
    "उसने एक सुंदर फूल खरीदी।",        # She bought a beautiful flower.
    "रात को चाँद चमक रहा था।",          # The moon was shining at night.
    "गाड़ी सड़क पर चल रही है।",          # The car is running on the road.
    "बच्चे खेल रहे हैं।",               # The children are playing.
    "यह एक शानदार अनुभव था।",          # It was a wonderful experience.
    "मैंने चाय पी और खाना खाया।",      # I drank tea and ate food.
    "सूरज सुबह उगता है।",              # The sun rises in the morning.
]

# Define a simple chunk pattern for Hindi (simplified and illustrative)
chunk_pattern = '''
    NP: {<NOUN|ADJ>*<NOUN>}
    VP: {<VERB><NP|PP>*}
    PP: {<PREP><NP>}
    '''

# Initialize chunk parser
chunk_parser = RegexpParser(chunk_pattern)

# Define a dummy POS tagger for Hindi (for demonstration purposes)
def pos_tag_hindi(tokens):
    # Example PoS tags; in practice, use a proper Hindi PoS tagger
    pos_tags = {
        "नयी": "ADJ", "दिल्ली": "NOUN", "भारत": "NOUN", "की": "ADJ", "राजधानी": "NOUN", "है": "VERB",
        "ताजमहल": "NOUN", "आगरा": "NOUN", "में": "PREP", "स्थित": "VERB", "है": "VERB",
        "मैंने": "PRON", "आज": "ADV", "एक": "NUM", "किताब": "NOUN", "पढ़ी": "VERB",
        "उसने": "PRON", "सुंदर": "ADJ", "फूल": "NOUN", "खरीदी": "VERB",
        "रात": "NOUN", "को": "PREP", "चाँद": "NOUN", "चमक": "VERB", "रहा": "VERB", "था": "VERB",
        "गाड़ी": "NOUN", "सड़क": "NOUN", "पर": "PREP", "चल": "VERB", "रही": "VERB", "है": "VERB",
        "बच्चे": "NOUN", "खेल": "VERB", "रहे": "VERB", "हैं": "VERB",
        "यह": "PRON", "एक": "NUM", "शानदार": "ADJ", "अनुभव": "NOUN", "था": "VERB",
        "मैंने": "PRON", "चाय": "NOUN", "पी": "VERB", "और": "CONJ", "खाना": "NOUN", "खाया": "VERB",
        "सूरज": "NOUN", "सुबह": "NOUN", "उगता": "VERB", "है": "VERB",
    }
    return [(token, pos_tags.get(token, 'UNKNOWN')) for token in tokens]

# Process and chunk sentences
for sentence in sentences:
    # Tokenize and POS tagging
    tokens = word_tokenize(sentence)
    tagged = pos_tag_hindi(tokens)

    # Perform chunking
    chunks = chunk_parser.parse(tagged)

    # Print sentence and chunking result
    print(f"Sentence: {sentence}")
    print("Chunks:")
    print(chunks)
    print()


Sentence: नयी दिल्ली भारत की राजधानी है।
Chunks:
(S
  (NP नयी/ADJ दिल्ली/NOUN भारत/NOUN की/ADJ राजधानी/NOUN)
  है।/UNKNOWN)

Sentence: ताजमहल आगरा में स्थित है।
Chunks:
(S (NP ताजमहल/NOUN आगरा/NOUN) में/PREP (VP स्थित/VERB) है।/UNKNOWN)

Sentence: मैंने आज एक किताब पढ़ी।
Chunks:
(S मैंने/PRON आज/ADV एक/NUM (NP किताब/NOUN) पढ़ी।/UNKNOWN)

Sentence: उसने एक सुंदर फूल खरीदी।
Chunks:
(S उसने/PRON एक/NUM (NP सुंदर/ADJ फूल/NOUN) खरीदी।/UNKNOWN)

Sentence: रात को चाँद चमक रहा था।
Chunks:
(S
  (NP रात/NOUN)
  (PP को/PREP (NP चाँद/NOUN))
  (VP चमक/VERB)
  (VP रहा/VERB)
  था।/UNKNOWN)

Sentence: गाड़ी सड़क पर चल रही है।
Chunks:
(S
  (NP गाड़ी/NOUN सड़क/NOUN)
  पर/PREP
  (VP चल/VERB)
  (VP रही/VERB)
  है।/UNKNOWN)

Sentence: बच्चे खेल रहे हैं।
Chunks:
(S (NP बच्चे/NOUN) (VP खेल/VERB) (VP रहे/VERB) हैं।/UNKNOWN)

Sentence: यह एक शानदार अनुभव था।
Chunks:
(S यह/PRON एक/NUM (NP शानदार/ADJ अनुभव/NOUN) था।/UNKNOWN)

Sentence: मैंने चाय पी और खाना खाया।
Chunks:
(S
  मैंने/PRON
  (NP चाय/NOUN)
  (VP पी/V

# Gujarati

In [None]:
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser

# Example sentences in Gujarati
gujarati_sentences = [
    "અમદાવાદ ગુજરાતની રાજધાની છે।",  # Ahmedabad is the capital of Gujarat.
    "કેન્દ્રીય વિઝ્યુમાર્જન ઇકનેર એસેન્ટન્સ છે।",  # The Central Library is a valuable resource.
    "મેં આજે એક નવલકથાની વાંચી।",  # I read a novel today.
    "તે એક સુંદર ફૂલ ખરીદી હતી।",  # She bought a beautiful flower.
    "રાતે ચંદ્ર પ્રકાશીત હતો।",  # The moon was shining at night.
]

# Define a simple chunk pattern for Gujarati
chunk_pattern_gujarati = '''
    NP: {<NOUN|ADJ>*<NOUN>}
    VP: {<VERB><NP|PP>*}
    PP: {<PREP><NP>}
    '''

# Initialize chunk parser
chunk_parser_gujarati = RegexpParser(chunk_pattern_gujarati)

# Define a dummy POS tagger for Gujarati
def pos_tag_gujarati(tokens):
    pos_tags = {
        "અમદાવાદ": "NOUN", "ગુજરાત": "NOUN", "ની": "ADJ", "રાજધાની": "NOUN", "છે": "VERB",
        "કેન્દ્રીય": "ADJ", "વિઝ્યુમાર્જન": "NOUN", "ઇકનેર": "NOUN", "એસેન્ટન્સ": "NOUN", "છે": "VERB",
        "મેં": "PRON", "આજે": "ADV", "એક": "NUM", "નવલકથાની": "NOUN", "વાંચી": "VERB",
        "તે": "PRON", "સુંદર": "ADJ", "ફૂલ": "NOUN", "ખરીદી": "VERB", "હતી": "VERB",
        "રાતે": "NOUN", "ચંદ્ર": "NOUN", "પ્રકાશીત": "VERB", "હતો": "VERB",
    }
    return [(token, pos_tags.get(token, 'UNKNOWN')) for token in tokens]

# Process and chunk sentences in Gujarati
for sentence in gujarati_sentences:
    tokens = word_tokenize(sentence)
    tagged = pos_tag_gujarati(tokens)
    chunks = chunk_parser_gujarati.parse(tagged)

    print(f"Sentence: {sentence}")
    print("Chunks:")
    print(chunks)
    print()


Sentence: અમદાવાદ ગુજરાતની રાજધાની છે।
Chunks:
(S (NP અમદાવાદ/NOUN) ગુજરાતની/UNKNOWN (NP રાજધાની/NOUN) છે।/UNKNOWN)

Sentence: કેન્દ્રીય વિઝ્યુમાર્જન ઇકનેર એસેન્ટન્સ છે।
Chunks:
(S
  (NP કેન્દ્રીય/ADJ વિઝ્યુમાર્જન/NOUN ઇકનેર/NOUN એસેન્ટન્સ/NOUN)
  છે।/UNKNOWN)

Sentence: મેં આજે એક નવલકથાની વાંચી।
Chunks:
(S મેં/PRON આજે/ADV એક/NUM (NP નવલકથાની/NOUN) વાંચી।/UNKNOWN)

Sentence: તે એક સુંદર ફૂલ ખરીદી હતી।
Chunks:
(S
  તે/PRON
  એક/NUM
  (NP સુંદર/ADJ ફૂલ/NOUN)
  (VP ખરીદી/VERB)
  હતી।/UNKNOWN)

Sentence: રાતે ચંદ્ર પ્રકાશીત હતો।
Chunks:
(S (NP રાતે/NOUN ચંદ્ર/NOUN) (VP પ્રકાશીત/VERB) હતો।/UNKNOWN)



# Marathi

In [None]:
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser

# Example sentences in Marathi
marathi_sentences = [
    "मुंबई महाराष्ट्राची राजधानी आहे।",  # Mumbai is the capital of Maharashtra.
    "ताजमहल आग्र्यात आहे।",                # The Taj Mahal is in Agra.
    "मी आज एक पुस्तक वाचले।",              # I read a book today.
    "तिने एक सुंदर फुल खरेदी केले।",       # She bought a beautiful flower.
    "रात्री चंद्र प्रकाशमान होता।",          # The moon was shining at night.
]

# Define a simple chunk pattern for Marathi
chunk_pattern_marathi = '''
    NP: {<NOUN|ADJ>*<NOUN>}
    VP: {<VERB><NP|PP>*}
    PP: {<PREP><NP>}
    '''

# Initialize chunk parser
chunk_parser_marathi = RegexpParser(chunk_pattern_marathi)

# Define a dummy POS tagger for Marathi
def pos_tag_marathi(tokens):
    pos_tags = {
        "मुंबई": "NOUN", "महाराष्ट्राची": "ADJ", "राजधानी": "NOUN", "आहे": "VERB",
        "ताजमहल": "NOUN", "आग्र्यात": "NOUN", "आहे": "VERB",
        "मी": "PRON", "आज": "ADV", "एक": "NUM", "पुस्तक": "NOUN", "वाचले": "VERB",
        "तिने": "PRON", "सुंदर": "ADJ", "फुल": "NOUN", "खरेदी": "VERB", "केले": "VERB",
        "रात्री": "NOUN", "चंद्र": "NOUN", "प्रकाशमान": "VERB", "होता": "VERB",
    }
    return [(token, pos_tags.get(token, 'UNKNOWN')) for token in tokens]

# Process and chunk sentences in Marathi
for sentence in marathi_sentences:
    tokens = word_tokenize(sentence)
    tagged = pos_tag_marathi(tokens)
    chunks = chunk_parser_marathi.parse(tagged)

    print(f"Sentence: {sentence}")
    print("Chunks:")
    print(chunks)
    print()


Sentence: मुंबई महाराष्ट्राची राजधानी आहे।
Chunks:
(S (NP मुंबई/NOUN महाराष्ट्राची/ADJ राजधानी/NOUN) आहे।/UNKNOWN)

Sentence: ताजमहल आग्र्यात आहे।
Chunks:
(S (NP ताजमहल/NOUN आग्र्यात/NOUN) आहे।/UNKNOWN)

Sentence: मी आज एक पुस्तक वाचले।
Chunks:
(S मी/PRON आज/ADV एक/NUM (NP पुस्तक/NOUN) वाचले।/UNKNOWN)

Sentence: तिने एक सुंदर फुल खरेदी केले।
Chunks:
(S
  तिने/PRON
  एक/NUM
  (NP सुंदर/ADJ फुल/NOUN)
  (VP खरेदी/VERB)
  केले।/UNKNOWN)

Sentence: रात्री चंद्र प्रकाशमान होता।
Chunks:
(S (NP रात्री/NOUN चंद्र/NOUN) (VP प्रकाशमान/VERB) होता।/UNKNOWN)



# **NER**

# English

In [None]:
# Install spaCy and download the English model
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Custom function to perform NER
def perform_ner(corpus):
    results = {}
    for text in corpus:
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        results[text] = entities
    return results

# Define NER categories
ner_categories = {
    "PERSON": "Names of people.",
    "ORG": "Organizations or companies.",
    "GPE": "Geopolitical entities, such as countries or cities.",
    "LOC": "Locations that are not geopolitical, like mountains or bodies of water.",
    "FAC": "Facilities, such as buildings, airports, highways, etc."
}

# Function to print NER categories
def print_ner_categories(categories):
    print("Named Entity Recognition (NER) Categories:\n")
    for category, description in categories.items():
        print(f"{category}: {description}")

# Call the function to print categories
print_ner_categories(ner_categories)

# Create a function to generate a sample corpus
def create_corpus():
    return [
        "Barack Obama was the 44th President of the United States.",
        "Apple Inc. is looking to buy a startup in the UK.",
        "Mount Everest is the highest mountain in the world.",
        "The Eiffel Tower is located in Paris.",
        "NASA launched a new satellite last week."
    ]

# Create the new corpus
corpus = create_corpus()

# Perform NER
ner_results = perform_ner(corpus)

# Custom function to display results
def display_results(ner_results):
    for text, entities in ner_results.items():
        print(f"Text: {text}")
        print("Entities:")
        for entity in entities:
            print(f" - {entity[0]}: {entity[1]}")
        print("\n")

# Display the results
display_results(ner_results)


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Named Entity Recognition (NER) Categories:

PERSON: Names of people.
ORG: Organizations or companies.
GPE: Geopolitical entities, such as countries or cities.
LOC: Locations that are not geopolitical, like mountains or bodies of water.
FAC: Facilities, such as buildings, airports, highways, etc.
Text: Barack Obama was the 44th President of

# Hindi

In [None]:
import re

# Predefined list of entities (for demonstration)
KNOWN_PERSONS = ["नरेंद्र मोदी", "महात्मा गांधी", "सचिन तेंदुलकर"]
KNOWN_LOCATIONS = ["भारत", "बांग्लादेश", "दिल्ली", "मुंबई"]
KNOWN_ORGANIZATIONS = ["बीजेपी", "कांग्रेस"]

# Sample Hindi text
text = ("नरेंद्र मोदी भारत के प्रधानमंत्री हैं। "
        "उन्होंने 2022 में बांग्लादेश का दौरा किया। "
        "यह कार्यक्रम दिल्ली में हुआ।")

# Stochastic NER function
def stochastic_ner(text):
    entities = []

    # Match predefined persons
    for person in KNOWN_PERSONS:
        if person in text:
            entities.append({
                'entity': person,
                'type': 'PERSON'
            })

    # Match predefined locations
    for location in KNOWN_LOCATIONS:
        if location in text:
            entities.append({
                'entity': location,
                'type': 'LOCATION'
            })

    # Match predefined organizations
    for org in KNOWN_ORGANIZATIONS:
        if org in text:
            entities.append({
                'entity': org,
                'type': 'ORGANIZATION'
            })

    # Regex-based pattern for detecting years
    years = re.findall(r'\b\d{4}\b', text)
    for year in years:
        entities.append({
            'entity': year,
            'type': 'DATE'
        })

    return entities

# Perform stochastic NER
ner_results = stochastic_ner(text)

# Print recognized entities
for entity in ner_results:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}")

Entity: नरेंद्र मोदी, Type: PERSON
Entity: भारत, Type: LOCATION
Entity: बांग्लादेश, Type: LOCATION
Entity: दिल्ली, Type: LOCATION
Entity: 2022, Type: DATE


## Gujarati

In [None]:
import re

# Predefined list of entities (for demonstration)
KNOWN_PERSONS = ["નરેન્દ્ર મોદી", "મહાત્મા ગાંધી", "સચિન તેન્ડુલકર"]
KNOWN_LOCATIONS = ["ભારત", "બાંગ્લાદેશ", "દિલ્હી", "મુંબઇ"]
KNOWN_ORGANIZATIONS = ["બીજેપી", "કાંગ્રસ"]

# Sample Gujarati text
text = ("નરેન્દ્ર મોદી ભારતના પ્રધાનમંત્રી છે. "
        "તેમણે 2022 માં બાંગ્લાદેશની મુલાકાત લીધી. "
        "આ કાર્યક્રમ દિલ્હીમાં થયો હતો.")

# Stochastic NER function
def stochastic_ner(text):
    entities = []

    # Match predefined persons
    for person in KNOWN_PERSONS:
        if person in text:
            entities.append({
                'entity': person,
                'type': 'PERSON'
            })

    # Match predefined locations
    for location in KNOWN_LOCATIONS:
        if location in text:
            entities.append({
                'entity': location,
                'type': 'LOCATION'
            })

    # Match predefined organizations
    for org in KNOWN_ORGANIZATIONS:
        if org in text:
            entities.append({
                'entity': org,
                'type': 'ORGANIZATION'
            })

    # Regex-based pattern for detecting years
    years = re.findall(r'\b\d{4}\b', text)
    for year in years:
        entities.append({
            'entity': year,
            'type': 'DATE'
        })

    return entities

# Perform stochastic NER
ner_results = stochastic_ner(text)

# Print recognized entities
for entity in ner_results:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}")


Entity: નરેન્દ્ર મોદી, Type: PERSON
Entity: ભારત, Type: LOCATION
Entity: બાંગ્લાદેશ, Type: LOCATION
Entity: દિલ્હી, Type: LOCATION
Entity: 2022, Type: DATE


## Marathi

In [None]:
import re

# Predefined list of entities (for demonstration)
KNOWN_PERSONS = ["नरेंद्र मोदी", "महात्मा गांधी", "सचिन तेंडुलकर"]
KNOWN_LOCATIONS = ["भारत", "बांग्लादेश", "दिल्ली", "मुंबई"]
KNOWN_ORGANIZATIONS = ["भाजप", "काँग्रेस"]

# Sample Marathi text
text = ("नरेंद्र मोदी भारताचे पंतप्रधान आहेत. "
        "त्यांनी 2022 मध्ये बांग्लादेशाची यात्रा केली. "
        "हा कार्यक्रम दिल्लीमध्ये झाला.")

# Stochastic NER function
def stochastic_ner(text):
    entities = []

    # Match predefined persons
    for person in KNOWN_PERSONS:
        if person in text:
            entities.append({
                'entity': person,
                'type': 'PERSON'
            })

    # Match predefined locations
    for location in KNOWN_LOCATIONS:
        if location in text:
            entities.append({
                'entity': location,
                'type': 'LOCATION'
            })

    # Match predefined organizations
    for org in KNOWN_ORGANIZATIONS:
        if org in text:
            entities.append({
                'entity': org,
                'type': 'ORGANIZATION'
            })

    # Regex-based pattern for detecting years
    years = re.findall(r'\b\d{4}\b', text)
    for year in years:
        entities.append({
            'entity': year,
            'type': 'DATE'
        })

    return entities

# Perform stochastic NER
ner_results = stochastic_ner(text)

# Print recognized entities
for entity in ner_results:
    print(f"Entity: {entity['entity']}, Type: {entity['type']}")


Entity: नरेंद्र मोदी, Type: PERSON
Entity: भारत, Type: LOCATION
Entity: बांग्लादेश, Type: LOCATION
Entity: दिल्ली, Type: LOCATION
Entity: 2022, Type: DATE
