In [6]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample input shared by the tokenization steps below.
text = "Natural Language Processing is amazing. It helps computers understand human language."

# Tokenizer resources: fetch 'punkt' and also 'punkt_tab', since 'punkt'
# alone was not enough for the tokenizers used in this cell.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split the text into sentences and show them.
sentences = sent_tokenize(text)
print("Sentences:", sentences)

# Split the same text into word/punctuation tokens and show them.
words = word_tokenize(text)
print("Words:", words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


Sentences: ['Natural Language Processing is amazing.', 'It helps computers understand human language.']
Words: ['Natural', 'Language', 'Processing', 'is', 'amazing', '.', 'It', 'helps', 'computers', 'understand', 'human', 'language', '.']


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [7]:
from nltk.corpus import stopwords
# Fetch the stopword corpus before reading from it.
nltk.download('stopwords')

# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercase form is not an English stopword; the
# original casing of the surviving tokens is preserved. Punctuation tokens
# are not stopwords, so they pass through unchanged.
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, words)
)

print("Filtered Words:", filtered_words)

Filtered Words: ['Natural', 'Language', 'Processing', 'amazing', '.', 'helps', 'computers', 'understand', 'human', 'language', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
from nltk.stem import PorterStemmer

# Reduce every stopword-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:", stemmed_words)


Stemmed Words: ['natur', 'languag', 'process', 'amaz', '.', 'help', 'comput', 'understand', 'human', 'languag', '.']


In [9]:
import spacy

# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

# The filtered tokens are re-joined with single spaces and parsed as one
# string.
# NOTE(review): spaCy tokenizes this string itself, so its tokens may not
# line up one-to-one with filtered_words, and the pipeline sees text with
# the stopwords already removed -- confirm this is intended.
doc = nlp(" ".join(filtered_words))

# Collect the lemma of every token spaCy produced.
lemmatized_words = list(map(lambda token: token.lemma_, doc))
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['Natural', 'Language', 'processing', 'amazing', '.', 'help', 'computer', 'understand', 'human', 'language', '.']
