<a href="https://colab.research.google.com/github/siddhesh1503/NLP/blob/main/NLP_EXP_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

POS Tagging using NLTK (English)

In [4]:
import nltk

# Download every resource under both its legacy and its current name so the
# cell works across NLTK versions: newer releases look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', while older releases look up 'punkt' and
# 'averaged_perceptron_tagger'. Names unknown to the installed NLTK index are
# skipped silently because of quiet=True.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Coarse part-of-speech categories, each mapped to the fine-grained tags
# (as returned by pos_tag) that belong to it.
POS_GROUPS = {
    'Noun': ['NN', 'NNS', 'NNP', 'NNPS'],
    'Verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'Modal': ['MD'],
    'Preposition': ['IN'],
    'Determiner': ['DT']
}

def filter_pos(tags):
    """Keep only the words whose tag falls in one of the POS_GROUPS categories.

    Args:
        tags: Iterable of (word, tag) pairs.

    Returns:
        A list of (word, category) pairs in input order, where category is
        the POS_GROUPS key whose tag list contains the word's tag. Words
        whose tag appears in no group are dropped.
    """
    matched = []
    for token, fine_tag in tags:
        for category, members in POS_GROUPS.items():
            if fine_tag in members:
                matched.append((token, category))
    return matched

def english_pos_tagger(sentence=None):
    """Tokenize and POS-tag an English sentence, then print the filtered tags.

    Args:
        sentence: Text to tag. When omitted (None), the sentence is read
            interactively with input(), which is the original behaviour.
            Passing a string lets the function run without a prompt, e.g.
            under "Restart & Run All" or from other code.

    Returns:
        None. The filtered (word, category) pairs are printed to stdout.
        For empty or whitespace-only text an error message is printed
        instead and nothing is tagged.
    """
    if sentence is None:
        sentence = input("Enter an English sentence: ")
    sentence = sentence.strip()
    if not sentence:
        print("Error: Empty input!")
        return

    tokens = word_tokenize(sentence)
    tags = pos_tag(tokens)
    # Reduce the fine-grained tagger output to the coarse categories
    # returned by filter_pos.
    filtered_tags = filter_pos(tags)

    print("\nFiltered POS Tags (Noun, Verb, Modal, Preposition, Determiner):")
    print(filtered_tags)

# Run the interactive tagger only when this code is executed as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    english_pos_tagger()


Enter an English sentence: the boy can play

Filtered POS Tags (Noun, Verb, Modal, Preposition, Determiner):
[('the', 'Determiner'), ('boy', 'Noun'), ('can', 'Modal'), ('play', 'Verb')]


Regional-Language POS Tagging (Hindi & Marathi using Stanza)


In [7]:
import stanza

# Both languages use the same processors and logging setting; keep them in
# one place so the download and pipeline calls stay in sync.
_STANZA_OPTIONS = {'processors': 'tokenize,pos', 'verbose': False}

# Fetch the Hindi and Marathi models first ...
for _lang_code in ('hi', 'mr'):
    stanza.download(_lang_code, **_STANZA_OPTIONS)

# ... then build one pipeline per language, once, for pos_tag_stanza to reuse.
nlp_hi = stanza.Pipeline('hi', **_STANZA_OPTIONS)
nlp_mr = stanza.Pipeline('mr', **_STANZA_OPTIONS)

# Functions
def pos_tag_stanza(text, lang):
    """Tag text with the pipeline for the requested language.

    Args:
        text: The text to tag.
        lang: Language code; 'hi' selects nlp_hi and 'mr' selects nlp_mr.

    Returns:
        A list of (word text, upos tag) pairs covering every word of every
        sentence in the processed document, in document order. Any other
        language code yields an empty list.
    """
    if lang not in ('hi', 'mr'):
        return []

    pipeline = nlp_hi if lang == 'hi' else nlp_mr
    parsed = pipeline(text)
    return [
        (token.text, token.upos)
        for sent in parsed.sentences
        for token in sent.words
    ]

def print_pos_tags(tags, language):
    """Print (word, tag) pairs as a two-column table.

    Args:
        tags: Iterable of (word, tag) pairs to display.
        language: Label shown in the table title.

    Output is a blank line, a "POS Tags (<language>):" title, a header row,
    a 30-character dashed rule, and then one row per pair with the word
    left-aligned in a 15-character column.
    """
    row_template = "{:<15} {}"

    print("\nPOS Tags ({}):".format(language))
    print(row_template.format('Word', 'POS Tag'))
    print("-" * 30)
    for token, label in tags:
        print(row_template.format(token, label))

# Demo: tag one sample sentence per supported language and print each table.
hi_text = "मुझे क्रिकेट खेलना पसंद है।"
mr_text = "मला क्रिकेट खेळायला आवडते."

for _sample, _code, _label in (
    (hi_text, 'hi', 'Hindi'),
    (mr_text, 'mr', 'Marathi'),
):
    print_pos_tags(pos_tag_stanza(_sample, _code), _label)



POS Tags (Hindi):
Word            POS Tag
------------------------------
मुझे            PRON
क्रिकेट         PROPN
खेलना           VERB
पसंद            ADJ
है              VERB
।               PUNCT

POS Tags (Marathi):
Word            POS Tag
------------------------------
मला             PRON
क्रिकेट         NOUN
खेळायला         VERB
आवडते           VERB
.               PUNCT
