In [3]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# Tokenizer data required by word_tokenize. Recent NLTK releases load the
# Punkt parameters from 'punkt_tab' (for every language, not only German);
# 'punkt' is kept so the cell also works on older NLTK versions.
# Each package is downloaded only when it is missing locally, so re-running
# the cell does not need network access once the data is installed.
for resource_path, package in (
    ("tokenizers/punkt/", "punkt"),
    ("tokenizers/punkt_tab/", "punkt_tab"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

# Initialize the German stemmer
stemmer = SnowballStemmer("german")

# Sample German sentence
german_text = "Die Kinder spielen im Garten und liefen schnell zur Schule."

# Tokenize the sentence into words; language='german' selects the German
# Punkt model used for the sentence-splitting step of word_tokenize.
tokens = word_tokenize(german_text, language='german')

# Stem every token. Punctuation tokens such as '.' are passed through the
# stemmer as well and come back unchanged.
stemmed_words = [stemmer.stem(word) for word in tokens]

# Show results
print("Original Words: ", tokens)
print("Stemmed Words:  ", stemmed_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Words:  ['Die', 'Kinder', 'spielen', 'im', 'Garten', 'und', 'liefen', 'schnell', 'zur', 'Schule', '.']
Stemmed Words:   ['die', 'kind', 'spiel', 'im', 'gart', 'und', 'lief', 'schnell', 'zur', 'schul', '.']


✅ Final Code with Explanation

import nltk
from nltk.stem.snowball import SnowballStemmer

    Imports NLTK for natural language processing tasks.

    Imports the SnowballStemmer class, which provides Snowball stemmers for several languages, including German.

# Download NLTK data (if not already done)
nltk.download('punkt')

    Downloads the Punkt sentence-tokenizer models. word_tokenize relies on Punkt to split the text into sentences before it splits each sentence into words.

# Download the 'punkt_tab' tokenizer data. Recent NLTK releases need it for
# word_tokenize in every language, not only German.
nltk.download('punkt_tab')  # This line was added to fix the LookupError

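    ✅ This is the fix: Downloads the punkt_tab resource, which recent NLTK releases require whenever word_tokenize is called, for any language and not only for language='german'. Without it, word_tokenize raises a LookupError.

    It contains the Punkt tokenizer parameters for all supported languages, including German.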

# Initialize the German stemmer
stemmer = SnowballStemmer("german")

    Creates a German language stemmer object to reduce words to their root forms.

# Sample German sentence
german_text = "Die Kinder spielen im Garten und liefen schnell zur Schule."

    A sample German sentence for testing.

# Tokenize the sentence into words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(german_text, language='german')

    Tokenizes the German sentence into individual word tokens using NLTK.

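    language='german' selects the German Punkt model for the sentence-splitting step, so German abbreviations and sentence boundaries are recognised correctly. The words inside each sentence are then split by NLTK's standard (Treebank-style) word tokenizer.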

# Perform stemming
stemmed_words = [stemmer.stem(word) for word in tokens]

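    Applies stemming to each token, reducing it to a lowercase stem (e.g. 'Garten' → 'gart', 'Kinder' → 'kind'). The stem is a normalized form and is not necessarily a dictionary word.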

# Show results
print("Original Words: ", tokens)
print("Stemmed Words:  ", stemmed_words)

    Displays the original words and their corresponding stemmed forms for comparison.

📌 Example Output (What you might see):

Original Words:  ['Die', 'Kinder', 'spielen', 'im', 'Garten', 'und', 'liefen', 'schnell', 'zur', 'Schule', '.']
Stemmed Words:   ['die', 'kind', 'spiel', 'im', 'gart', 'und', 'lief', 'schnell', 'zur', 'schul', '.']

🎓 Tip for Viva:

If they ask why stemming is important:

    "Stemming helps normalize words by reducing them to their root form. This improves text matching and reduces dimensionality in NLP tasks like search engines, sentiment analysis, or classification."