<a href="https://colab.research.google.com/github/siddhesh1503/NLP/blob/main/NLP_EXP_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Chunking using NLTK (English)**

In [None]:
!pip install nltk --quiet

import nltk
from nltk import pos_tag, word_tokenize, RegexpParser

# Fetch tokenizer and POS-tagger resources. Both the older resource names and
# the "_tab" / "_eng" variants are requested so the cell keeps working with
# newer NLTK releases; quiet=True suppresses the download log.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)

# Chunk grammar. RegexpParser applies the stages top to bottom in one pass,
# so a stage can only match chunk labels produced by a stage listed above it.
# ADJP is therefore defined before VP (previously VP referenced ADJP before it
# existed, plus a CLAUSE label that was never defined), and ADJP now matches
# adverbs *before* the adjective ("very smart") rather than after it.
grammar = r"""
  NP:   {<DT>?<JJ.*>*<NN.*>+}   # Noun Phrase
  PP:   {<IN><NP>}              # Prepositional Phrase
  ADJP: {<RB.*>*<JJ.*>+}        # Adjective Phrase
  VP:   {<VB.*><NP|PP|ADJP>*}   # Verb Phrase
"""

chunk_parser = RegexpParser(grammar)

# Interactive loop: tokenize, tag and chunk each sentence the user types
# until they enter "exit" (case-insensitive, surrounding whitespace ignored).
while True:
    sentence = input("\n[NLTK] Enter a sentence (or 'exit' to quit): ").strip()
    if sentence.lower() == "exit":
        break
    if not sentence:
        # Blank input has no tokens to chunk; ask again instead of parsing it.
        continue

    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)
    tree = chunk_parser.parse(tagged)

    print("\nPOS Tags:", tagged)
    print("\nChunk Tree (pretty print):")
    tree.pretty_print()   # ASCII-art rendering of the chunk tree

    print("\nBracketed Tree:")
    print(tree)           # S-expression style



[NLTK] Enter a sentence (or 'exit' to quit): The boy is playing

POS Tags: [('The', 'DT'), ('boy', 'NN'), ('is', 'VBZ'), ('playing', 'VBG')]

Chunk Tree (pretty print):
             S                      
         ____|________________       
        NP          VP        VP    
   _____|____       |         |      
The/DT     boy/NN is/VBZ playing/VBG


Bracketed Tree:
(S (NP The/DT boy/NN) (VP is/VBZ) (VP playing/VBG))

[NLTK] Enter a sentence (or 'exit' to quit): exit


# **Chunking using spaCy + Benepar (English)**

In [None]:
# Install packages
!pip install spacy benepar torch --quiet
!python -m spacy download en_core_web_sm > /dev/null

import spacy
import benepar

# Load the small English spaCy model installed by the command above.
nlp = spacy.load("en_core_web_sm")

# Fetch the benepar_en3 constituency model. The captured output below shows
# this call reports "already up-to-date" when the model is present.
benepar.download("benepar_en3")

# Attach the benepar component to the pipeline. The membership check keeps
# a re-run of this cell from trying to add the same component twice.
if "benepar" not in nlp.pipe_names:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

def spacy_chunking(sentence):
    """Print POS tags and the Benepar constituency tree for *sentence*.

    Only the first sentence that the pipeline segments from the input is
    analysed. Blank input, or input that yields no sentences, prints a short
    notice and returns instead of raising ``IndexError``.

    Args:
        sentence: Text to analyse.

    Returns:
        None. Results are written to standard output.
    """
    # Nothing to parse: bail out before running the (expensive) pipeline.
    if not sentence or not sentence.strip():
        print("\nNo text to parse.")
        return

    doc = nlp(sentence)
    first_sent = next(iter(doc.sents), None)
    if first_sent is None:
        print("\nNo sentence found in the input.")
        return

    # token.tag_ is the fine-grained tag (DT, NN, VBZ, ... for this model).
    tags = [(token.text, token.tag_) for token in first_sent]

    # Benepar exposes the constituency parse as a bracketed string.
    parse_string = first_sent._.parse_string

    # Convert to an NLTK Tree for pretty printing (imported here because this
    # cell does not import Tree at the top).
    from nltk import Tree
    tree_obj = Tree.fromstring(parse_string)

    print("\nPOS Tags:", tags)
    print("\nChunk Tree (pretty print):")
    tree_obj.pretty_print()
    print("\nBracketed Tree:")
    print(tree_obj)


# Interactive loop: parse each sentence the user types until they enter
# "exit" (case-insensitive, surrounding whitespace ignored).
while True:
    sentence = input("\n[spaCy + Benepar] Enter a sentence (or 'exit' to quit): ").strip()
    if sentence.lower() == "exit":
        break
    if not sentence:
        # Blank input has nothing to parse; ask again.
        continue
    spacy_chunking(sentence)


[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!



[spaCy + Benepar] Enter a sentence (or 'exit' to quit): The boy is smart


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



POS Tags: [('The', 'DT'), ('boy', 'NN'), ('is', 'VBZ'), ('smart', 'JJ')]

Chunk Tree (pretty print):
             S               
      _______|_______         
     |               VP      
     |            ___|____    
     NP          |       ADJP
  ___|___        |        |   
 DT      NN     VBZ       JJ 
 |       |       |        |   
The     boy      is     smart


Bracketed Tree:
(S (NP (DT The) (NN boy)) (VP (VBZ is) (ADJP (JJ smart))))

[spaCy + Benepar] Enter a sentence (or 'exit' to quit): the boy is playing

POS Tags: [('the', 'DT'), ('boy', 'NN'), ('is', 'VBZ'), ('playing', 'VBG')]

Chunk Tree (pretty print):
             S                 
      _______|_______           
     |               VP        
     |            ___|_____     
     NP          |         VP  
  ___|___        |         |    
 DT      NN     VBZ       VBG  
 |       |       |         |    
the     boy      is     playing


Bracketed Tree:
(S (NP (DT the) (NN boy)) (VP (VBZ is) (VP (VBG playing))))

# **Chunking using Stanza (English)**

In [None]:
# Install requirements
!pip install stanza benepar torch --quiet

import stanza
import benepar
from nltk import Tree
import logging

# Only let Stanza log records at ERROR level or above through.
logging.getLogger("stanza").setLevel(logging.ERROR)

# Download the English Stanza models (verbose=False keeps it quiet).
stanza.download("en", verbose=False)

# Build the English pipeline: tokenization, POS tagging and constituency
# parsing. This `nlp` is what stanza_chunking() below calls.
nlp = stanza.Pipeline("en", processors="tokenize,pos,constituency", verbose=False)

# Download the benepar model. Unlike the Stanza calls above this one is not
# silenced (see the [nltk_data] lines in the cell output).
# NOTE(review): nothing visible in this cell uses benepar; the parse comes
# from Stanza's own constituency processor. Confirm whether this is needed.
benepar.download("benepar_en3")

def stanza_chunking(sentence):
    """Print POS tags and the Stanza constituency tree for *sentence*.

    Only the first sentence that Stanza segments from the input is analysed.
    Blank input, or input that yields no sentences, prints a short notice and
    returns instead of raising ``IndexError``.

    Args:
        sentence: Text to analyse.

    Returns:
        None. Results are written to standard output.
    """
    # Nothing to parse: bail out before running the pipeline.
    if not sentence or not sentence.strip():
        print("\nNo text to parse.")
        return

    doc = nlp(sentence)
    if not doc.sentences:
        print("\nNo sentence found in the input.")
        return
    sent = doc.sentences[0]

    # (word, language-specific POS tag) pairs
    tags = [(word.text, word.xpos) for word in sent.words]

    # Stanza's constituency tree is re-read from its string form as an NLTK
    # Tree so it can be pretty-printed.
    tree_obj = Tree.fromstring(str(sent.constituency))

    print("\nPOS Tags:", tags)
    print("\nChunk Tree (pretty print):")
    tree_obj.pretty_print()
    print("\nBracketed Tree:")
    print(tree_obj)


# Interactive loop: parse each sentence the user types until they enter
# "exit" (case-insensitive, surrounding whitespace ignored).
while True:
    sentence = input("\nEnter a sentence (or 'exit' to quit): ").strip()
    if sentence.lower() == "exit":
        break
    if not sentence:
        # Blank input has nothing to parse; ask again.
        continue
    stanza_chunking(sentence)


[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!



Enter a sentence (or 'exit' to quit): The boy is smart

POS Tags: [('The', 'DT'), ('boy', 'NN'), ('is', 'VBZ'), ('smart', 'JJ')]

Chunk Tree (pretty print):
            ROOT              
             |                 
             S                
      _______|________         
     |                VP      
     |             ___|____    
     NP           |       ADJP
  ___|___         |        |   
 DT      NN      VBZ       JJ 
 |       |        |        |   
The     boy       is     smart


Bracketed Tree:
(ROOT (S (NP (DT The) (NN boy)) (VP (VBZ is) (ADJP (JJ smart)))))

Enter a sentence (or 'exit' to quit): exit


# **Chunking using Stanza (Regional Languages: Hindi and Marathi)**

In [None]:
!pip install stanza nltk --quiet

import stanza
import logging
from nltk import Tree

# Only let Stanza log records at ERROR level or above through.
logging.getLogger("stanza").setLevel(logging.ERROR)

# Fetch the Hindi and then the Marathi models, quietly.
for _code in ("hi", "mr"):
    stanza.download(_code, verbose=False)

# One dependency-parsing pipeline per language code. The lemma processor is
# included alongside tokenize/pos because depparse is configured with it.
pipelines = {
    _code: stanza.Pipeline(
        _code,
        processors="tokenize,pos,lemma,depparse",
        verbose=False,
    )
    for _code in ("hi", "mr")
}

def stanza_dependency_tree(sentence, lang="hi"):
    """Print POS tags and a dependency tree for the first sentence of *sentence*.

    The tree is rooted at the word whose ``head`` is 0; every other word is
    attached beneath the word its ``head`` points to. Nodes are labelled
    ``text/UPOS``. Blank input, or input that yields no sentences, prints a
    short notice and returns instead of raising ``IndexError``.

    Args:
        sentence: Text to analyse. Only the first segmented sentence is used.
        lang: Key into the module-level ``pipelines`` dict (``"hi"`` by default).

    Returns:
        None. Results are written to standard output.

    Raises:
        KeyError: If the text is non-blank and ``lang`` is not in ``pipelines``.
    """
    # Nothing to parse: bail out before touching the pipeline.
    if not sentence or not sentence.strip():
        print("\nNo text to parse.")
        return

    nlp = pipelines[lang]
    doc = nlp(sentence)
    if not doc.sentences:
        print("\nNo sentence found in the input.")
        return
    sent = doc.sentences[0]

    # Map each word id to the ids of its dependents, and find the root
    # (the word whose head is 0).
    children = {word.id: [] for word in sent.words}
    root = None
    for word in sent.words:
        if word.head == 0:
            root = word.id
        else:
            children[word.head].append(word.id)

    if root is None:
        print("\nNo root word found in the dependency parse.")
        return

    def build_tree(node_id):
        """Return an NLTK Tree for *node_id*, or a bare label for a leaf."""
        word = sent.words[node_id - 1]  # ids are 1-based; head 0 marks the root
        label = f"{word.text}/{word.upos}"
        dependents = children[node_id]
        if dependents:
            return Tree(label, [build_tree(child) for child in dependents])
        return label

    tree = build_tree(root)

    print(f"\n===== {lang.upper()} Sentence =====")
    print("Sentence:", sentence)

    print("\nPOS Tags:")
    for w in sent.words:
        print(f"{w.text}\t{w.upos}\t({w.xpos})")

    print("\nChunk Tree :")
    if isinstance(tree, Tree):
        tree.pretty_print()
    else:
        # Single-word sentence: the "tree" is just one label, so show it
        # rather than leaving this section empty.
        print(tree)
    print("\nBracketed Tree:")
    print(tree)


# Demo sentences, run in order: Hindi first, then Marathi.
for _demo_text, _demo_lang in (
    ("मुझे क्रिकेट खेलना पसंद है।", "hi"),
    ("मला क्रिकेट खेळायला आवडते.", "mr"),
):
    stanza_dependency_tree(_demo_text, _demo_lang)



===== HI Sentence =====
Sentence: मुझे क्रिकेट खेलना पसंद है।

POS Tags:
मुझे	PRON	(PRP)
क्रिकेट	PROPN	(NNP)
खेलना	VERB	(VM)
पसंद	ADJ	(JJ)
है	VERB	(VM)
।	PUNCT	(SYM)

Chunk Tree :
          है/VERB                       
     ________|___________________        
    |        |        |      खेलना/VERB 
    |        |        |          |       
मुझे/PRON पसंद/ADJ ।/PUNCT क्रिकेट/PROPN


Bracketed Tree:
(है/VERB मुझे/PRON (खेलना/VERB क्रिकेट/PROPN) पसंद/ADJ ।/PUNCT)

===== MR Sentence =====
Sentence: मला क्रिकेट खेळायला आवडते.

POS Tags:
मला	PRON	(None)
क्रिकेट	NOUN	(None)
खेळायला	VERB	(None)
आवडते	VERB	(None)
.	PUNCT	(None)

Chunk Tree :
         आवडते/VERB             
    _________|___________        
   |         |      खेळायला/VERB
   |         |           |       
मला/PRON  ./PUNCT   क्रिकेट/NOUN


Bracketed Tree:
(आवडते/VERB मला/PRON (खेळायला/VERB क्रिकेट/NOUN) ./PUNCT)
