# Stemming vs Lemmatization in NLP

This notebook demonstrates the differences between stemming and lemmatization in Natural Language Processing, using the NLTK and spaCy libraries.

In [None]:
# Import required libraries
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy

# Download required NLTK data.
# NLTK 3.9+ looks up the tokenizer and tagger models under new resource names
# ('punkt_tab', 'averaged_perceptron_tagger_eng'); older releases use the
# original names. Request both so a fresh kernel works on either version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

## 1. Basic Stemming with Porter Stemmer

In [None]:
# Create the Porter stemmer (later cells reuse this `porter` instance)
porter = PorterStemmer()

# Sample words: several forms of "run" plus a few "-ly" / "-ness" forms
words = ['running', 'runs', 'ran', 'runner', 'easily', 'fairly', 'fairness']

# Stem every sample word, keeping the same order as `words`
stemmed_words = list(map(porter.stem, words))

# Show each word next to its stem
for index, word in enumerate(words):
    print(f"{word} -> {stemmed_words[index]}")

## 2. Lemmatization with WordNet Lemmatizer

In [None]:
# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Example words to lemmatize (same list as the stemming example)
words = ['running', 'runs', 'ran', 'runner', 'easily', 'fairly', 'fairness']

# lemmatize() treats every word as a noun unless told otherwise (pos='n' is
# the default), so verb forms such as 'running' and 'ran' come back unchanged.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

# Lemmatize again as verbs (pos='v') to show how much the part of speech matters
lemmatized_as_verbs = [lemmatizer.lemmatize(word, pos='v') for word in words]

# Display both results side by side
for original, lemmatized, as_verb in zip(words, lemmatized_words, lemmatized_as_verbs):
    print(f"{original} -> {lemmatized} (as noun), {as_verb} (as verb)")

## 3. Comparison with spaCy

In [None]:
# Load the spaCy pipeline named 'en_core_web_sm' (it must already be installed)
nlp = spacy.load('en_core_web_sm')

# Two short sentences containing several forms of "run"
text = "The running man runs easily and fairly. The runner ran yesterday."

# Run the pipeline over the text
doc = nlp(text)

# Collect (surface form, lemma) pairs, then print one pair per line
lemma_pairs = [(token.text, token.lemma_) for token in doc]

print("spaCy Lemmatization:")
for surface, lemma in lemma_pairs:
    print(f"{surface} -> {lemma}")

## 4. Real-world Example with a Sentence

In [None]:
# Example sentence
sentence = "The cats are running and jumping in the garden while the dogs are barking loudly."

# Tokenize sentence
tokens = word_tokenize(sentence)

# Apply stemming
stemmed_tokens = [porter.stem(token) for token in tokens]


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as produced by nltk.pos_tag) to a WordNet POS letter.

    Tags starting with J/V/R map to adjective/verb/adverb ('a'/'v'/'r');
    everything else falls back to noun ('n'), which is also lemmatize()'s default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Apply lemmatization.
# Without a POS hint lemmatize() treats every token as a noun, leaving verbs
# such as 'running' or 'are' untouched, so tag the tokens first and pass the
# matching WordNet POS for each one.
tagged_tokens = nltk.pos_tag(tokens)
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
    for token, tag in tagged_tokens
]

# Display results
print("Original tokens:", tokens)
print("\nStemmed tokens:", stemmed_tokens)
print("\nLemmatized tokens:", lemmatized_tokens)

## 5. Performance Comparison

In [None]:
import time

# Create a larger text for testing (the two example sentences repeated 100 times)
test_text = "The running man runs easily and fairly. The runner ran yesterday. " * 100
tokens = word_tokenize(test_text)

# Use time.perf_counter() rather than time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations, whereas the
# wall clock can be coarse and may be adjusted while the cell runs.

# Test stemming performance
start_time = time.perf_counter()
stemmed = [porter.stem(token) for token in tokens]
stemming_time = time.perf_counter() - start_time

# Test lemmatization performance
start_time = time.perf_counter()
lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
lemmatization_time = time.perf_counter() - start_time

print(f"Stemming time: {stemming_time:.4f} seconds")
print(f"Lemmatization time: {lemmatization_time:.4f} seconds")

## 6. When to Use Each Method

### Stemming
- Faster processing
- Good for information retrieval
- More aggressive but less accurate: stems are not always real words (e.g. "easily" → "easili")
- Useful for search engines

### Lemmatization
- More accurate results
- Preserves meaning
- Slower processing
- Better for text analysis and understanding