In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ronit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ronit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ronit\AppData\Roaming\nltk_data...


True

In [2]:
# 1. Tokenize sentence
sentence1 = "Natural Language Processing with Python is fun."
tokens1 = word_tokenize(sentence1)
print("Tokens:", tokens1)

Tokens: ['Natural', 'Language', 'Processing', 'with', 'Python', 'is', 'fun', '.']


In [3]:
# 2. Remove punctuation
sentence2 = "Hello there! How's the weather today?"
tokens2 = []
for word in word_tokenize(sentence2):
    if word.isalnum():
        tokens2.append(word)
print("No Punctuation:", tokens2)

No Punctuation: ['Hello', 'there', 'How', 'the', 'weather', 'today']


In [4]:
# 3. Remove stopwords
sentence3 = "This is a simple sentence for stopword removal."
tokens3 = []
for word in word_tokenize(sentence3):
    if word.lower() not in stopwords.words('english'):
        tokens3.append(word)
print("No Stopwords:", tokens3)

No Stopwords: ['simple', 'sentence', 'stopword', 'removal', '.']


In [5]:
# 4. Stemming with PorterStemmer
sentence4 = "The striped bats are hanging on their feet for best."
stemmer = PorterStemmer()
tokens4 = []
for word in word_tokenize(sentence4):
    tokens4.append(stemmer.stem(word))
print("Stemmed:", tokens4)

Stemmed: ['the', 'stripe', 'bat', 'are', 'hang', 'on', 'their', 'feet', 'for', 'best', '.']


In [6]:
# 5. Lemmatization with WordNetLemmatizer
sentence5 = "The geese are flying south for the winter."
lemmatizer = WordNetLemmatizer()
tokens5 = []
for word in word_tokenize(sentence5):
    tokens5.append(lemmatizer.lemmatize(word))
print("Lemmatized:", tokens5)

Lemmatized: ['The', 'goose', 'are', 'flying', 'south', 'for', 'the', 'winter', '.']


In [7]:
# 6. Convert to lowercase & remove punctuation
sentence6 = "Hello, World! NLP with Python."
tokens6 = []
for word in word_tokenize(sentence6):
    if word.isalnum():
        tokens6.append(word.lower())
print("Lowercase & No Punctuation:", tokens6)

Lowercase & No Punctuation: ['hello', 'world', 'nlp', 'with', 'python']


In [8]:
# 7. Sentence tokenization
sentence7 = "Hello World. This is NLTK. Let's explore NLP!"
sentences7 = sent_tokenize(sentence7)
print("Sentence Tokens:", sentences7)

Sentence Tokens: ['Hello World.', 'This is NLTK.', "Let's explore NLP!"]


In [9]:
# 8. LancasterStemmer
sentence8 = "Loving the experience of learning NLTK"
lancaster_stemmer = LancasterStemmer()
tokens8 = []
for word in word_tokenize(sentence8):
    tokens8.append(lancaster_stemmer.stem(word))
print("Lancaster Stemmed:", tokens8)

Lancaster Stemmed: ['lov', 'the', 'expery', 'of', 'learn', 'nltk']


In [10]:
# 9. Remove stopwords & punctuation
sentence9 = "This is a test sentence, with stopwords and punctuation!"
tokens9 = []
for word in word_tokenize(sentence9):
    if word.isalnum() and word.lower() not in stopwords.words('english'):
        tokens9.append(word)
print("No Stopwords & No Punctuation:", tokens9)

No Stopwords & No Punctuation: ['test', 'sentence', 'stopwords', 'punctuation']


In [11]:
# 10. Lemmatization with POS tagging
sentence10 = "The striped bats are hanging on their feet."
tokens10 = []
for word in word_tokenize(sentence10):
    tokens10.append(lemmatizer.lemmatize(word))
print("Lemmatized with POS:", tokens10)

Lemmatized with POS: ['The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'foot', '.']


In [12]:
# 11. Tokenize, remove stopwords, punctuation, & stemming
sentence11 = "Running through the forest, the fox is faster."
tokens11 = []
for word in word_tokenize(sentence11):
    if word.isalnum() and word.lower() not in stopwords.words('english'):
        tokens11.append(stemmer.stem(word))
print("Tokenized, No Stopwords & Stemmed:", tokens11)

Tokenized, No Stopwords & Stemmed: ['run', 'forest', 'fox', 'faster']


In [13]:
# 12. Count stopwords
sentence12 = "This is an example sentence for counting stopwords."
stopword_count12 = 0
for word in word_tokenize(sentence12):
    if word.lower() in stopwords.words('english'):
        stopword_count12 += 1
print("Stopword Count:", stopword_count12)

Stopword Count: 4


In [14]:
# 13. Stemming & punctuation removal with RegexTokenizer
sentence13 = "Stemming, punctuation! Removal example."
regex_tokenizer = RegexpTokenizer(r'\w+')
tokens13 = []
for word in regex_tokenizer.tokenize(sentence13):
    tokens13.append(stemmer.stem(word))
print("Regex Tokenizer & Stemmed:", tokens13)

Regex Tokenizer & Stemmed: ['stem', 'punctuat', 'remov', 'exampl']


In [15]:
# 14. Remove punctuation with regex
sentence14 = "Punctuation removal with regex in NLP!"
tokens14 = re.sub(r'[^\w\s]', '', sentence14)
print("Regex Punctuation Removal:", tokens14)

Regex Punctuation Removal: Punctuation removal with regex in NLP


In [16]:
# 15. Tokenize, remove stopwords, and lemmatize
sentence15 = "The dogs are barking loudly."
tokens15 = []
for word in word_tokenize(sentence15):
    if word.isalnum() and word.lower() not in stopwords.words('english'):
        tokens15.append(lemmatizer.lemmatize(word))
print("Tokenized, No Stopwords & Lemmatized:", tokens15)

Tokenized, No Stopwords & Lemmatized: ['dog', 'barking', 'loudly']
