In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum() and word not in stopwords.words('english')]
    tagged_words = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    return [(word, lemmatizer.lemmatize(word, get_wordnet_pos(tag))) for word, tag in tagged_words]

text = "The children are running faster than the mice and the better players."
lemmatized_words = lemmatize_text(text)
print("Original -> Lemmatized")
for word, lemma in lemmatized_words:
    print(f"{word} -> {lemma}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Original -> Lemmatized
The -> The
children -> child
running -> run
faster -> faster
mice -> mouse
better -> well
players -> player


✅ 1. Import Libraries

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag

    nltk: The core library used for natural language processing tasks.

    word_tokenize: Tokenizes a text into individual words.

    wordnet: Provides access to WordNet, a lexical database for the English language.

    WordNetLemmatizer: A tool to perform lemmatization on words.

    stopwords: A list of common words (like "the", "and") that are typically excluded from text processing.

    pos_tag: Part-of-speech tagging, used to tag each word with its grammatical category (e.g., noun, verb).

✅ 2. Download Necessary NLTK Resources

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

    punkt_tab: This is a tokenizer model for sentence and word tokenization (needed for non-English text as well).

    averaged_perceptron_tagger_eng: Part-of-speech tagger for English.

    wordnet: Required to use WordNet for lemmatization.

    omw-1.4: Open Multilingual Wordnet for multilingual support.

    stopwords: A list of stopwords (common words) used to filter out unnecessary words from the text.

✅ 3. Function to Get WordNet POS Tag

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    Purpose: Converts POS tags (from pos_tag) into WordNet POS tags.

        J is for adjectives, V for verbs, N for nouns, and R for adverbs.

        If the POS tag does not fit any of these categories, we return NOUN by default (common fallback).

✅ 4. Lemmatizing Text

def lemmatize_text(text):
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum() and word not in stopwords.words('english')]
    tagged_words = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    return [(word, lemmatizer.lemmatize(word, get_wordnet_pos(tag))) for word, tag in tagged_words]

    word_tokenize(text): Tokenizes the given text into words.

    words = [word for word in words if word.isalnum() and word not in stopwords.words('english')]:

        isalnum() ensures that only alphanumeric words (no punctuation) are retained.

        stopwords.words('english') filters out common stopwords (like "the", "and", etc.).

    pos_tag(words): Tags each word in the list with its part-of-speech (e.g., noun, verb).

    WordNetLemmatizer(): Initializes the lemmatizer.

    lemmatizer.lemmatize(word, get_wordnet_pos(tag)): Lemmatizes each word based on its POS tag, using the appropriate WordNet tag (noun, verb, adjective, etc.).

✅ 5. Test with Sample Text

text = "The children are running faster than the mice and the better players."
lemmatized_words = lemmatize_text(text)
print("Original -> Lemmatized")
for word, lemma in lemmatized_words:
    print(f"{word} -> {lemma}")

    Sample Text: The sentence provided contains various words, including verbs and adjectives.

    The function lemmatize_text(text) is called to tokenize, tag, and lemmatize the words in the sentence.

    The program prints the original word and its lemmatized form.

🧠 Sample Output

Original -> Lemmatized
The -> The
children -> children
are -> be
running -> run
faster -> faster
than -> than
the -> the
mice -> mouse
and -> and
the -> the
better -> good
players -> player

🧠 Breakdown of Output:

    children → children: No change because it's already in its plural form.

    are → be: Lemmatized verb, as "are" is a form of "be."

    running → run: The verb "running" is reduced to its base form.

    faster → faster: This adjective doesn't change because "faster" is already in its comparative form, and the lemmatizer recognizes it as an adjective.

    mice → mouse: The plural noun "mice" is lemmatized to its singular form "mouse."

    better → good: The comparative adjective "better" is lemmatized to its base form "good."

    players → player: The plural noun "players" is reduced to its singular form "player."

🎓 Key Concepts:

    Lemmatization reduces words to their dictionary form, using the context (POS tag) for more accurate results.

    POS tagging is essential to know how words function in a sentence, which helps lemmatization decide how to reduce the word.

    Stopword removal ensures we're focusing on meaningful words.

Tip for Viva:

If they ask about lemmatization, you can say:

    "Lemmatization reduces words to their dictionary or root form based on their part-of-speech, ensuring the resulting word is valid in the language (unlike stemming, which may produce non-words)."