In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [16]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Sample text data
text_data = [
    "Stemming is the process of reducing words to their base or root form",
    "Lemmatization, on the other hand, aims to return the base or dictionary form of a word",
    "Tokenization is the process of breaking down text into words or smaller linguistic units"
]

In [19]:

# Initialize stemmer, lemmatizer, and tokenizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

In [20]:
# Preprocess function
def preprocess_text(text):
    # Tokenize text
    tokens = tokenizer.tokenize(text.lower())  # Convert to lowercase
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming and Lemmatization
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens, stemmed_tokens, lemmatized_tokens

In [21]:
# Process each text in the dataset
for idx, text in enumerate(text_data):
    tokens, stemmed_tokens, lemmatized_tokens = preprocess_text(text)

    print(f"Text {idx + 1}:")
    print("Original Tokens:", tokens)
    print("Stemmed Tokens:", stemmed_tokens)
    print("Lemmatized Tokens:", lemmatized_tokens)

Text 1:
Original Tokens: ['stemming', 'process', 'reducing', 'words', 'base', 'root', 'form']
Stemmed Tokens: ['stem', 'process', 'reduc', 'word', 'base', 'root', 'form']
Lemmatized Tokens: ['stemming', 'process', 'reducing', 'word', 'base', 'root', 'form']
Text 2:
Original Tokens: ['lemmatization', 'hand', 'aims', 'return', 'base', 'dictionary', 'form', 'word']
Stemmed Tokens: ['lemmat', 'hand', 'aim', 'return', 'base', 'dictionari', 'form', 'word']
Lemmatized Tokens: ['lemmatization', 'hand', 'aim', 'return', 'base', 'dictionary', 'form', 'word']
Text 3:
Original Tokens: ['tokenization', 'process', 'breaking', 'text', 'words', 'smaller', 'linguistic', 'units']
Stemmed Tokens: ['token', 'process', 'break', 'text', 'word', 'smaller', 'linguist', 'unit']
Lemmatized Tokens: ['tokenization', 'process', 'breaking', 'text', 'word', 'smaller', 'linguistic', 'unit']
