In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Download the NLTK data used by the cells below:
#   - 'punkt' and 'punkt_tab': tokenizer data for word_tokenize. Newer NLTK releases
#     look up 'punkt_tab' rather than 'punkt', so both are requested to keep the
#     notebook runnable on a fresh kernel regardless of the installed version.
#   - 'stopwords': the word lists behind stopwords.words('english').
# Packages that are already present are reported as up-to-date and not fetched again.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample document used by every later cell. It is written as adjacent string
# literals so the line breaks and the trailing space after "languages." are explicit.
text = (
    "\n"
    "Natural Language Processing (NLP) is a sub-field of artificial intelligence. "
    "It involves the interaction between computers and human languages. \n"
    "The goal of NLP is to enable computers to understand, interpret, and generate human language.\n"
)


In [4]:
# Step 4: Tokenization
# Break the raw text into a flat list of tokens; punctuation marks such as '(' and
# '.' come out as tokens of their own. "english" is word_tokenize's default
# language and is only spelled out here for the reader.
tokens = word_tokenize(text, language="english")

# Show the token list
print("Tokens after Tokenization:", tokens)

Tokens after Tokenization: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'sub-field', 'of', 'artificial', 'intelligence', '.', 'It', 'involves', 'the', 'interaction', 'between', 'computers', 'and', 'human', 'languages', '.', 'The', 'goal', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', '.']


In [5]:
# Step 5: Stopword Removal
# Drop very common English words ("the", "is", ...). Tokens are compared in
# lowercase, so capitalised forms such as "The" and "It" are removed too, while
# the surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

# Show what is left after the stopwords are gone
print("\nTokens after Stopword Removal:", filtered_tokens)



Tokens after Stopword Removal: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'sub-field', 'artificial', 'intelligence', '.', 'involves', 'interaction', 'computers', 'human', 'languages', '.', 'goal', 'NLP', 'enable', 'computers', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', '.']


In [6]:
# Step 6: Remove Punctuation
# A token is kept only if it contains at least one non-punctuation character.
# Checking character by character (rather than `word not in punctuation`) also
# removes the multi-character punctuation tokens that word_tokenize can emit,
# e.g. '``', "''", '...' or '--'; those never match a set of single characters.
# Tokens that merely contain punctuation, such as 'sub-field', are left untouched.
punctuation = set(string.punctuation)  # Set of single punctuation characters
filtered_tokens = [
    word for word in filtered_tokens
    if any(char not in punctuation for char in word)
]

# Output tokens after punctuation removal
print("\nTokens after Punctuation Removal:", filtered_tokens)



Tokens after Punctuation Removal: ['Natural', 'Language', 'Processing', 'NLP', 'sub-field', 'artificial', 'intelligence', 'involves', 'interaction', 'computers', 'human', 'languages', 'goal', 'NLP', 'enable', 'computers', 'understand', 'interpret', 'generate', 'human', 'language']


In [7]:
# Step 7: Stemming
# Apply the Porter stemmer to every remaining token. The results are stems, not
# necessarily dictionary words (e.g. 'Natural' -> 'natur').
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

# Show the stemmed tokens
print("\nTokens after Stemming:", stemmed_tokens)



Tokens after Stemming: ['natur', 'languag', 'process', 'nlp', 'sub-field', 'artifici', 'intellig', 'involv', 'interact', 'comput', 'human', 'languag', 'goal', 'nlp', 'enabl', 'comput', 'understand', 'interpret', 'gener', 'human', 'languag']


In [8]:
# Step 8: Lowercasing
# Normalise every stem to lowercase. In the recorded run the stems coming out of
# step 7 were already lowercase, so the list printed here matches the previous one.
processed_tokens = list(map(str.lower, stemmed_tokens))

# Show the final, fully processed tokens
print("\nTokens after Lowercasing:", processed_tokens)



Tokens after Lowercasing: ['natur', 'languag', 'process', 'nlp', 'sub-field', 'artifici', 'intellig', 'involv', 'interact', 'comput', 'human', 'languag', 'goal', 'nlp', 'enabl', 'comput', 'understand', 'interpret', 'gener', 'human', 'languag']


In [9]:
# Side-by-side view of each token and its Porter stem.
# Dictionary keys are unique, so a token that occurs several times in
# filtered_tokens (e.g. 'NLP') is listed once, in order of first appearance.
stemmer = PorterStemmer()
original_and_stemmed = dict(
    zip(filtered_tokens, map(stemmer.stem, filtered_tokens))
)

# Print one "original -> stem" line per distinct token
print("\nOriginal and Stemmed Words:")
for original, stemmed in original_and_stemmed.items():
    print(f"Original: {original} -> Stemmed: {stemmed}")
    


Original and Stemmed Words:
Original: Natural -> Stemmed: natur
Original: Language -> Stemmed: languag
Original: Processing -> Stemmed: process
Original: NLP -> Stemmed: nlp
Original: sub-field -> Stemmed: sub-field
Original: artificial -> Stemmed: artifici
Original: intelligence -> Stemmed: intellig
Original: involves -> Stemmed: involv
Original: interaction -> Stemmed: interact
Original: computers -> Stemmed: comput
Original: human -> Stemmed: human
Original: languages -> Stemmed: languag
Original: goal -> Stemmed: goal
Original: enable -> Stemmed: enabl
Original: understand -> Stemmed: understand
Original: interpret -> Stemmed: interpret
Original: generate -> Stemmed: gener
Original: language -> Stemmed: languag
