# Task: Text Normalization Challenge

input_text

In [3]:
text = """Natural Language Processing helps computers understand, interpret, and generate human language. 
It involves techniques like tokenization, stemming, and lemmatization to clean text data efficiently.
"""

Requirements

import nltk
nltk.download('punkt')  # for tokenizers
nltk.download("punkt_tab")
nltk.download("stopwords") # for stopwords
nlk.download("wordnet") # for stemming and lemma

## Tokenize the paragraph

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize

# Sentence Tokenization
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)

# Word Tokenization
words_tokens = word_tokenize(text)
print("Word Tokenization:", words_tokens)

word_punctuation = wordpunct_tokenize(text)
print("Word Punct Tokenization:", text)


print("Total words before preprocessing:", len(words_tokens))

Sentence Tokenization: ['Natural Language Processing helps computers understand, interpret, and generate human language.', 'It involves techniques like tokenization, stemming, and lemmatization to clean text data efficiently.']
Word Tokenization: ['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', '.', 'It', 'involves', 'techniques', 'like', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', 'to', 'clean', 'text', 'data', 'efficiently', '.']
Word Punct Tokenization: Natural Language Processing helps computers understand, interpret, and generate human language. 
It involves techniques like tokenization, stemming, and lemmatization to clean text data efficiently.

Total words before preprocessing: 30


## Remove stopwords


In [17]:
from nltk.corpus import stopwords
stopwrd =set(stopwords.words("english"))
# print("STOP WORDS IN NLTK :\n", stopwrd)
filtered_words = [ word  for word in words_tokens if word.lower() not in stopwrd ]
print("After stopword removal:", len(filtered_words))


After stopword removal: 26


## Apply stemming

In [20]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_words = []

for w in filtered_words:
    stemmed_words.append(ps.stem(w))
    print(w, " --> ", ps.stem(w))

print("Stemmed words:", stemmed_words)

Natural  -->  natur
Language  -->  languag
Processing  -->  process
helps  -->  help
computers  -->  comput
understand  -->  understand
,  -->  ,
interpret  -->  interpret
,  -->  ,
generate  -->  gener
human  -->  human
language  -->  languag
.  -->  .
involves  -->  involv
techniques  -->  techniqu
like  -->  like
tokenization  -->  token
,  -->  ,
stemming  -->  stem
,  -->  ,
lemmatization  -->  lemmat
clean  -->  clean
text  -->  text
data  -->  data
efficiently  -->  effici
.  -->  .
Stemmed words: ['natur', 'languag', 'process', 'help', 'comput', 'understand', ',', 'interpret', ',', 'gener', 'human', 'languag', '.', 'involv', 'techniqu', 'like', 'token', ',', 'stem', ',', 'lemmat', 'clean', 'text', 'data', 'effici', '.']


## Apply lemmatization


In [21]:
from nltk.stem import WordNetLemmatizer
ws = WordNetLemmatizer()
lemma_words = []
for w in filtered_words:
    print(w, " --> ", ws.lemmatize(w))
    lemma_words.append(ws.lemmatize(w))

print("Lemmatized words:", lemma_words)

Natural  -->  Natural
Language  -->  Language
Processing  -->  Processing
helps  -->  help
computers  -->  computer
understand  -->  understand
,  -->  ,
interpret  -->  interpret
,  -->  ,
generate  -->  generate
human  -->  human
language  -->  language
.  -->  .
involves  -->  involves
techniques  -->  technique
like  -->  like
tokenization  -->  tokenization
,  -->  ,
stemming  -->  stemming
,  -->  ,
lemmatization  -->  lemmatization
clean  -->  clean
text  -->  text
data  -->  data
efficiently  -->  efficiently
.  -->  .
Lemmatized words: ['Natural', 'Language', 'Processing', 'help', 'computer', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', '.', 'involves', 'technique', 'like', 'tokenization', ',', 'stemming', ',', 'lemmatization', 'clean', 'text', 'data', 'efficiently', '.']


In [29]:

' '.join(lemma_words)

'Natural Language Processing help computer understand , interpret , generate human language . involves technique like tokenization , stemming , lemmatization clean text data efficiently .'

stemmed_words, lemmatized_words generated differently that are not in word_tokens

In [24]:
unique_stem = [word for word in stemmed_words if word not in filtered_words]
print(unique_stem)

['natur', 'languag', 'process', 'help', 'comput', 'gener', 'languag', 'involv', 'techniqu', 'token', 'stem', 'lemmat', 'effici']


In [25]:
unique_lemma = [word for word in lemma_words if word not in filtered_words]
print(unique_lemma)

['help', 'computer', 'technique']


# Unique tokens in stemming and lemma,  count distinct words only, ignoring duplicates.

In [27]:
# Unique stemmed tokens
unique_stemmed_tokens = set(stemmed_words)
print("Unique tokens after stemming:", len(unique_stemmed_tokens))

# Unique lemmatized tokens
unique_lemmatized_tokens = set(lemma_words)
print("Unique tokens after lemmatization:", len(unique_lemmatized_tokens))


Unique tokens after stemming: 21
Unique tokens after lemmatization: 22


## Compare

Which version (stemmed or lemmatized) gives more readable words?
- lemmatized words are more readable than stemming

How many unique tokens remain after both?
- Unique tokens after stemming: 21
- Unique tokens after lemmatization: 22

In [None]:
## 