In [1]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download required NLTK data files (if not already done)
nltk.download('punkt')
nltk.download('stopwords')

# Sample text
text = "Natural Language Processing (NLP) is a field of AI. It's important for analyzing text data. There are many things to learn!"

# 1. Tokenization
def tokenize_text(text):
    return (text)

# 2. Normalization (Lowercase, removing special characters)
def normalize_text(text):
    # Convert text to lowercase and remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return text

# 3. Stopword Removal
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# 4. Stemming
def stem_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]

# Apply all steps
normalized_text = normalize_text(text)
tokens = tokenize_text(normalized_text)
tokens_without_stopwords = remove_stopwords(tokens)
stemmed_tokens = stem_tokens(tokens_without_stopwords)

# Output results
print("Original Text:", text)
print("Normalized Text:", normalized_text)
print("Tokens:", tokens)
print("Tokens without Stopwords:", tokens_without_stopwords)
print("Stemmed Tokens:", stemmed_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Original Text: Natural Language Processing (NLP) is a field of AI. It's important for analyzing text data. There are many things to learn!
Normalized Text: natural language processing nlp is a field of ai its important for analyzing text data there are many things to learn
Tokens: natural language processing nlp is a field of ai its important for analyzing text data there are many things to learn
Tokens without Stopwords: ['n', 'u', 'r', 'l', ' ', 'l', 'n', 'g', 'u', 'g', 'e', ' ', 'p', 'r', 'c', 'e', 'n', 'g', ' ', 'n', 'l', 'p', ' ', ' ', ' ', 'f', 'e', 'l', ' ', 'f', ' ', ' ', ' ', 'p', 'r', 'n', ' ', 'f', 'r', ' ', 'n', 'l', 'z', 'n', 'g', ' ', 'e', 'x', ' ', ' ', 'h', 'e', 'r', 'e', ' ', 'r', 'e', ' ', 'n', ' ', 'h', 'n', 'g', ' ', ' ', 'l', 'e', 'r', 'n']
Stemmed Tokens: ['n', 'u', 'r', 'l', ' ', 'l', 'n', 'g', 'u', 'g', 'e', ' ', 'p', 'r', 'c', 'e', 'n', 'g', ' ', 'n', 'l', 'p', ' ', ' ', ' ', 'f', 'e', 'l', ' ', 'f', ' ', ' ', ' ', 'p', 'r', 'n', ' ', 'f', 'r', ' ', 'n', 'l', '