In [28]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

text = "Hey, did you know that **NLP makes. language processing easy??"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Code Step-by-Step Explanation
1. Lowercase (text = text.lower())
Converts all text to lowercase so that ‘NLP’ and ‘nlp’ are treated as the same word.

This normalization ensures case differences don't fragment word statistics in further processing.

2. Remove Punctuation (text.translate(str.maketrans('', '', string.punctuation)))
Removes punctuation marks like commas, question marks, and exclamation marks, which usually don't carry semantic meaning in most NLP analyses.

Makes subsequent tokenization and matching cleaner by stripping non-word symbols.

3. Tokenize (tokens = word_tokenize(text))
Splits the sentence into individual words, called tokens, creating a list of the smallest meaningful text units for analysis.

Essential to convert unstructured text to a structured form suitable for modeling.

4. Remove Stopwords (tokens = [word for word in tokens if word not in stopwords.words('english')])
Filters out very common words like ‘the’, ‘and’, ‘is’, which don’t contribute significantly to semantic meaning or analysis.

Reduces noise by keeping only words likely to be relevant for NLP tasks.

5. Lemmatization (tokens = [lemmatizer.lemmatize(word) for word in tokens])
Reduces words to their base dictionary form (‘makes’ → ‘make’, ‘processing’ → ‘processing’).

Helps group similar concepts and shrinks vocabulary size, making analysis more robust.

In [29]:
!ls /root/nltk_data/

corpora  tokenizers


In [30]:
!ls /root/nltk_data/corpora/

stopwords  stopwords.zip  wordnet.zip


In [31]:
!ls /root/nltk_data/tokenizers/punkt/

czech.pickle	 french.pickle	   polish.pickle      spanish.pickle
danish.pickle	 german.pickle	   portuguese.pickle  swedish.pickle
dutch.pickle	 greek.pickle	   PY3		      turkish.pickle
english.pickle	 italian.pickle    README
estonian.pickle  malayalam.pickle  russian.pickle
finnish.pickle	 norwegian.pickle  slovene.pickle


In [32]:
!ls /root/nltk_data/corpora/stopwords/

albanian     bengali  english  hebrew	   kazakh      romanian  tajik
arabic	     catalan  finnish  hinglish    nepali      russian	 tamil
azerbaijani  chinese  french   hungarian   norwegian   slovene	 turkish
basque	     danish   german   indonesian  portuguese  spanish
belarusian   dutch    greek    italian	   README      swedish


In [33]:
!cat !ls /root/nltk_data/corpora/stopwords/english

cat: '!ls': No such file or directory
a
about
above
after
again
against
ain
all
am
an
and
any
are
aren
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can
couldn
couldn't
d
did
didn
didn't
do
does
doesn
doesn't
doing
don
don't
down
during
each
few
for
from
further
had
hadn
hadn't
has
hasn
hasn't
have
haven
haven't
having
he
he'd
he'll
her
here
hers
herself
he's
him
himself
his
how
i
i'd
if
i'll
i'm
in
into
is
isn
isn't
it
it'd
it'll
it's
its
itself
i've
just
ll
m
ma
me
mightn
mightn't
more
most
mustn
mustn't
my
myself
needn
needn't
no
nor
not
now
o
of
off
on
once
only
or
other
our
ours
ourselves
out
over
own
re
s
same
shan
shan't
she
she'd
she'll
she's
should
shouldn
shouldn't
should've
so
some
such
t
than
that
that'll
the
their
theirs
them
themselves
then
there
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
ve
very
was
wasn
wasn't
we
we'd
we'll
we're
were
weren
weren't
we've
what
when
where
which
while
who
whom
why
will
with
won


In [34]:
# 1. Lowercase
text = text.lower()
text

'hey, did you know that **nlp makes. language processing easy??'

In [35]:
# 2. Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))
text

'hey did you know that nlp makes language processing easy'

In [36]:
# 3. Tokenize into words
import nltk
nltk.download('punkt_tab') # Download the missing resource
tokens = word_tokenize(text)
tokens

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['hey',
 'did',
 'you',
 'know',
 'that',
 'nlp',
 'makes',
 'language',
 'processing',
 'easy']

In [37]:
# 4. Remove stopwords
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens

['hey', 'know', 'nlp', 'makes', 'language', 'processing', 'easy']

In [38]:
# 5. Lemmatize words
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]
tokens

['hey', 'know', 'nlp', 'make', 'language', 'processing', 'easy']