In [1]:
import nltk

# Download necessary NLTK data (do this once)
# You might get a pop-up window, select 'punkt' and 'stopwords' and click download.
# If you don't get a pop-up, you can try:
# nltk.download('punkt')
# nltk.download('stopwords')
# If you get a "Resource 'tokenizers/punkt' not found" error, run the above downloads.

from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text
text = "Hello, world! This is a sample sentence for NLP preprocessing. It's quite interesting."

# 1. Word Tokenization: Splitting text into individual words
words = word_tokenize(text)
print(f"Original Text: {text}")
print(f"Word Tokens: {words}")

# 2. Sentence Tokenization: Splitting text into individual sentences
sentences = sent_tokenize(text)
print(f"Sentence Tokens: {sentences}")

Original Text: Hello, world! This is a sample sentence for NLP preprocessing. It's quite interesting.
Word Tokens: ['Hello', ',', 'world', '!', 'This', 'is', 'a', 'sample', 'sentence', 'for', 'NLP', 'preprocessing', '.', 'It', "'s", 'quite', 'interesting', '.']
Sentence Tokens: ['Hello, world!', 'This is a sample sentence for NLP preprocessing.', "It's quite interesting."]


In [3]:
import string

text = "Hello, World! This is a Sample Sentence with Punctuation."

# 1. Lowercasing
lower_text = text.lower()
print(f"Original Text: {text}")
print(f"Lowercased Text: {lower_text}")

# 2. Removing Punctuation
# Method A: Using str.translate and string.punctuation
# This creates a translation table that maps each punctuation character to None (effectively deleting it)
translator = str.maketrans('', '', string.punctuation)
no_punct_text_a = text.translate(translator)
print(f"Text without punctuation (Method A): {no_punct_text_a}")

# Method B: Using Regular Expressions (re module)
import re
# re.sub(pattern, replacement, string)
# [^\w\s] matches any character that is NOT a word character (alphanumeric) and NOT a whitespace character
no_punct_text_b = re.sub(r'[^\w\s]', '', text)
print(f"Text without punctuation (Method B - regex): {no_punct_text_b}")

# Combine: Lowercase and remove punctuation
cleaned_text_combined = re.sub(r'[^\w\s]', '', text.lower())
print(f"Combined (Lowercase & No Punctuation): {cleaned_text_combined}")

Original Text: Hello, World! This is a Sample Sentence with Punctuation.
Lowercased Text: hello, world! this is a sample sentence with punctuation.
Text without punctuation (Method A): Hello World This is a Sample Sentence with Punctuation
Text without punctuation (Method B - regex): Hello World This is a Sample Sentence with Punctuation
Combined (Lowercase & No Punctuation): hello world this is a sample sentence with punctuation


In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

sample_text = "This is a very important sentence demonstrating stop word removal."

# Get English stop words
stop_words = set(stopwords.words('english'))
print(f"First 10 English Stop Words: {list(stop_words)[:10]}")

# Tokenize the sample text
words = word_tokenize(sample_text)

# Remove stop words
filtered_words = [word for word in words if word.lower() not in stop_words]

print(f"\nOriginal Words: {words}")
print(f"Filtered Words (No Stop Words): {filtered_words}")

First 10 English Stop Words: ['my', "weren't", 'during', 'being', "you'd", 'once', 'an', "couldn't", 'yourselves', 'hasn']

Original Words: ['This', 'is', 'a', 'very', 'important', 'sentence', 'demonstrating', 'stop', 'word', 'removal', '.']
Filtered Words (No Stop Words): ['important', 'sentence', 'demonstrating', 'stop', 'word', 'removal', '.']


In [7]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download WordNet data for lemmatization (do this once)
# nltk.download('wordnet')
# nltk.download('omw-1.4') # Open Multilingual Wordnet (often needed with wordnet)

words_to_process = ["running", "runs", "ran", "generously", "universal", "better", "studying", "studies"]

# 1. Stemming using Porter Stemmer
porter_stemmer = PorterStemmer()
stemmed_words = [porter_stemmer.stem(word) for word in words_to_process]
print(f"Original Words: {words_to_process}")
print(f"Stemmed Words: {stemmed_words}")

# 2. Lemmatization using WordNet Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatization often requires specifying the Part of Speech (POS) tag for better accuracy.
# 'v' for verb, 'n' for noun, 'a' for adjective, 'r' for adverb.
# If POS is not specified, it defaults to 'n' (noun).
lemmatized_words_default = [wordnet_lemmatizer.lemmatize(word) for word in words_to_process]
print(f"Lemmatized Words (default POS=noun): {lemmatized_words_default}")

# Example with specified POS for better results for verbs/adjectives
lemmatized_words_pos = [
    wordnet_lemmatizer.lemmatize("running", pos='v'),
    wordnet_lemmatizer.lemmatize("runs", pos='v'),
    wordnet_lemmatizer.lemmatize("ran", pos='v'),
    wordnet_lemmatizer.lemmatize("better", pos='a'),
    wordnet_lemmatizer.lemmatize("studying", pos='v'),
    wordnet_lemmatizer.lemmatize("studies", pos='v')
]
print(f"Lemmatized Words (with specified POS): {lemmatized_words_pos}")

Original Words: ['running', 'runs', 'ran', 'generously', 'universal', 'better', 'studying', 'studies']
Stemmed Words: ['run', 'run', 'ran', 'gener', 'univers', 'better', 'studi', 'studi']
Lemmatized Words (default POS=noun): ['running', 'run', 'ran', 'generously', 'universal', 'better', 'studying', 'study']
Lemmatized Words (with specified POS): ['run', 'run', 'run', 'good', 'study', 'study']


In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# Ensure necessary NLTK data is downloaded (if not already)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Initialize lemmatizer and stop words outside the function for efficiency
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Applies a series of preprocessing steps to a given text.
    - Lowercasing
    - Punctuation removal
    - Tokenization
    - Stop word removal
    - Lemmatization (defaults to noun if POS not determined)
    """
    # 1. Lowercasing
    text = text.lower()

    # 2. Punctuation Removal (using regex, similar to Method B from earlier)
    text = re.sub(r'[^\w\s]', '', text) # Keep only word characters (alphanumeric) and spaces

    # 3. Tokenization
    tokens = word_tokenize(text)

    # 4. Stop Word Removal
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # 5. Lemmatization
    # For better lemmatization, one would typically use POS tagging,
    # but for simplicity in this introductory exercise, we'll use default.
    # You can add logic here to determine POS if you wish to explore more advanced concepts later.
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return lemmatized_tokens

# Test the complete preprocessing function
sample_paragraph = "Natural Language Processing (NLP) is a fascinating field. It combines computer science, artificial intelligence, and linguistics. Many new advancements are running quickly, making machines understand human language better."

processed_paragraph_tokens = preprocess_text(sample_paragraph)
print(f"Original Paragraph:\n{sample_paragraph}")
print(f"\nProcessed Tokens:\n{processed_paragraph_tokens}")

Original Paragraph:
Natural Language Processing (NLP) is a fascinating field. It combines computer science, artificial intelligence, and linguistics. Many new advancements are running quickly, making machines understand human language better.

Processed Tokens:
['natural', 'language', 'processing', 'nlp', 'fascinating', 'field', 'combine', 'computer', 'science', 'artificial', 'intelligence', 'linguistics', 'many', 'new', 'advancement', 'running', 'quickly', 'making', 'machine', 'understand', 'human', 'language', 'better']
