In [16]:
import nltk

In [17]:
from nltk import word_tokenize,sent_tokenize

In [18]:
from nltk.corpus import stopwords

In [19]:
text = "Hello Sreenivasa. How are you? I hope you're learning NLP. This is nice!"

sentences = sent_tokenize(text)
print("Sentences split :", sentences)

Sentences split : ['Hello Sreenivasa.', 'How are you?', "I hope you're learning NLP.", 'This is nice!']


In [20]:
text = "Natural Language Processing is an exciting field in Artificial Intelligence!"

# Tokenize
tokens = word_tokenize(text)
print("Tokens:", tokens)



Tokens: ['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'field', 'in', 'Artificial', 'Intelligence', '!']


In [21]:
stop_words = set(stopwords.words("english"))
print("stop_words:", stop_words)

stop_words: {'couldn', 'was', "it'd", 'have', "she'll", "that'll", 'ain', 'i', 'needn', 'same', 'whom', "she's", 'mightn', 'most', 'that', 'their', "we'd", "we've", 'in', 'can', 'won', 'were', 'over', 'very', 'before', "needn't", 'where', 'she', "you've", 'y', "wasn't", 'under', 'doing', 'each', 'are', 'at', "weren't", 'll', 'both', "haven't", 'own', 'then', 'him', 'hasn', 'o', 'only', 'while', "he's", 'being', 'up', "won't", 'those', 'we', 'you', 'by', "mustn't", 'until', 'some', 'these', 'to', 'aren', 'yourselves', "i'm", 'will', 'not', 'again', "i've", 'do', "we'll", 's', 'be', 'because', "should've", 'from', 't', 'more', 'other', 'for', 'has', 'herself', 'yourself', "you'll", 'with', 'doesn', 'ourselves', 'so', "they'd", "wouldn't", 'haven', 'wasn', 'out', 'but', 'after', 'is', 'why', 'which', 'down', 'the', 'when', 'her', 'about', 'didn', 'had', 'between', "hasn't", 've', 'ma', 'does', 'he', "shouldn't", "they'll", "don't", 'there', 'if', 'been', "it'll", 'any', 'shan', 'his', "do

In [22]:
# Remove stopwords
filtered = [word for word in tokens if word.lower() not in stop_words] # it is combination of for and if conditions.
print("Filtered Tokens:", filtered)

Filtered Tokens: ['Natural', 'Language', 'Processing', 'exciting', 'field', 'Artificial', 'Intelligence', '!']


In [23]:
# Stemmer (Most common)
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["running", "runs", "runner", "easily", "fairly", "happiness"]

for w in words:
    print(w, "→", ps.stem(w))

running → run
runs → run
runner → runner
easily → easili
fairly → fairli
happiness → happi


In [25]:
#1. Porter Stemmer (Classic & widely used)
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk



ps = PorterStemmer()

words = ["running", "runner", "easily", "fairly", "happiness", "flying", "cars"]

for w in words:
    print(f"{w:12} → {ps.stem(w)}")

running      → run
runner       → runner
easily       → easili
fairly       → fairli
happiness    → happi
flying       → fli
cars         → car


In [27]:
#2. Snowball Stemmer (Better & cleaner version)
from nltk.stem import SnowballStemmer

snow = SnowballStemmer("english")

words = ["playing", "plays", "played", "automation", "automated"]

for w in words:
    print(f"{w:12} → {snow.stem(w)}")

playing      → play
plays        → play
played       → play
automation   → autom
automated    → autom


In [29]:
# 3. Lancaster Stemmer (Very aggressive)

from nltk.stem import LancasterStemmer

ls = LancasterStemmer()

words = ["running", "runner", "nationality", "happiness", "maximum", "taking"]

for w in words:
    print(f"{w:12} → {ls.stem(w)}")

running      → run
runner       → run
nationality  → nat
happiness    → happy
maximum      → maxim
taking       → tak


In [31]:
# Lemmatizer Code in Python (NLTK — simple version)

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


lemmatizer = WordNetLemmatizer()

text = "The children are running and the cats were eating their meals."

tokens = word_tokenize(text)

for w in tokens:
    print(f"{w:12} → {lemmatizer.lemmatize(w)}")

The          → The
children     → child
are          → are
running      → running
and          → and
the          → the
cats         → cat
were         → were
eating       → eating
their        → their
meals        → meal
.            → .


In [32]:
# Stopwords + Lemmatizer in Pypthon

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """
    Removes stopwords + lemmatizes words.
    No POS tagging (simple lemmatizer).
    """
    tokens = word_tokenize(text)

    result = []
    for w in tokens:
        w_lower = w.lower()

        # skip stopwords & non-alphabetic words
        if w_lower in stop_words or not w_lower.isalpha():
            continue

        # lemmatize
        lemma = lemmatizer.lemmatize(w_lower)
        result.append(lemma)

    return result


# Example usage
text = "The children were eating apples happily while the cats were running."

output = clean_text(text)
print(output)


['child', 'eating', 'apple', 'happily', 'cat', 'running']


In [35]:
# POS Tagging Code in Python

import nltk

sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)

pos_tags = nltk.pos_tag(tokens)
print(pos_tags)


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [37]:
# FULL NLP PIPELINE CODE (Token → Stopwords → POS → Lemma)

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer


# Initialize
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


# Helper to map POS tags → WordNet POS
def get_wordnet_pos(tag):
    if tag.startswith('J'):  # Adjective
        return wordnet.ADJ
    elif tag.startswith('V'):  # Verb
        return wordnet.VERB
    elif tag.startswith('N'):  # Noun
        return wordnet.NOUN
    elif tag.startswith('R'):  # Adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default


def nlp_pipeline(text):
    # Tokenization
    tokens = word_tokenize(text)

    # POS tagging
    pos_tags = nltk.pos_tag(tokens)

    final_words = []

    for word, tag in pos_tags:
        w = word.lower()

        # Skip stopwords & non-alphabetic
        if w in stop_words or not w.isalpha():
            continue

        # Lemmatize using POS
        wn_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(w, wn_pos)

        final_words.append({
            "word": word,
            "pos": tag,
            "lemma": lemma
        })

    return final_words


# Example usage
text = "The children are eating apples happily while the cats were running fast."

output = nlp_pipeline(text)

for item in output:
    print(f"WORD: {item['word']:12}  POS: {item['pos']:6}  LEMMA: {item['lemma']}")




WORD: children      POS: NNS     LEMMA: child
WORD: eating        POS: VBG     LEMMA: eat
WORD: apples        POS: NNS     LEMMA: apple
WORD: happily       POS: RB      LEMMA: happily
WORD: cats          POS: NNS     LEMMA: cat
WORD: running       POS: VBG     LEMMA: run
WORD: fast          POS: RB      LEMMA: fast


In [42]:
# show how the same word gets different POS tags in different sentences.

#What this does

#Tokenizes each sentence
#Runs POS tagging
#Prints each token with its POS tag
#Marks the target word (<-- target) so you can see how its tag changes (e.g., VB vs NN, etc.)

import nltk


def show_pos(word, sentences):
    print(f"\n=== Word: '{word}' ===")
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        tags = nltk.pos_tag(tokens)

        print(f"\nSentence: {s}")
        print("Tokens + POS:")
        for tok, tag in tags:
            mark = "  <-- target" if tok.lower() == word.lower() else ""
            print(f"  {tok:12} {tag:5} {mark}")
    print("-" * 40)


# Examples for different words
run_sentences = [
    "I run every morning.",
    "He went for a run.",
]

light_sentences = [
    "This bag is light.",
    "Turn on the light.",
    "Please light the candle.",
]

book_sentences = [
    "I read a book.",
    "I will book a ticket.",
]

watch_sentences = [
    "I watch movies at night.",
    "I bought a new watch yesterday.",
]

park_sentences = [
    "We went to the park.",
    "Please park the car here.",
]

# Show POS behavior
show_pos("run", run_sentences)
show_pos("light", light_sentences)
show_pos("book", book_sentences)
show_pos("watch", watch_sentences)
show_pos("park", park_sentences)




=== Word: 'run' ===

Sentence: I run every morning.
Tokens + POS:
  I            PRP   
  run          VBP     <-- target
  every        DT    
  morning      NN    
  .            .     

Sentence: He went for a run.
Tokens + POS:
  He           PRP   
  went         VBD   
  for          IN    
  a            DT    
  run          NN      <-- target
  .            .     
----------------------------------------

=== Word: 'light' ===

Sentence: This bag is light.
Tokens + POS:
  This         DT    
  bag          NN    
  is           VBZ   
  light        JJ      <-- target
  .            .     

Sentence: Turn on the light.
Tokens + POS:
  Turn         NN    
  on           IN    
  the          DT    
  light        NN      <-- target
  .            .     

Sentence: Please light the candle.
Tokens + POS:
  Please       NNP   
  light        VBD     <-- target
  the          DT    
  candle       NN    
  .            .     
----------------------------------------

=== Word: 'bo

In [45]:
# Python Code — NLTK Chunking Example

import nltk
sentence = "The quick brown fox jumps over the lazy dog."

# Step 1: Tokenization
tokens = nltk.word_tokenize(sentence)

# Step 2: POS Tagging
pos_tags = nltk.pos_tag(tokens)

# Step 3: Chunk Grammar Rule (Noun Phrase)
grammar = r"""
    NP: {<DT>?<JJ>*<NN>}   # NP = optional determiner + adjectives + noun
"""

# Step 4: Create chunk parser
chunk_parser = nltk.RegexpParser(grammar)

# Step 5: Parse (chunking)
chunk_tree = chunk_parser.parse(pos_tags)

print(chunk_tree)
chunk_tree.draw()   # graphical tree window



(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)
