**Text Preprocessing**


*   Tokenization-- Word and sentence
*   Stopwords removal
*   Stemming
*   Lemmatization
*   Frequency distribution







**NLTK** (Natural Language ToolKit): Processing Natural Language Text

In [0]:
text='''In the early days, many language-processing systems were designed by hand-
        coding a set of rules,[9],[10] e.g. by writing grammars or devising heuristic 
        rules for stemming. However, this is rarely robust to natural language variation.
        Since the so-called "statistical revolution"[11][12] in the late 1980s and mid 
        1990s, much natural language processing research has relied heavily on machine 
        learning.The machine-learning paradigm calls instead for using statistical inference 
        to automatically learn such rules through the analysis of large corpora of typical
        real-world examples (a corpus (plural, "corpora") is a set of documents, possibly 
        with human or computer annotations).Many different classes of machine-learning 
        algorithms have been applied to natural-language-processing tasks. These 
        algorithms take as input a large set of "features" that are generated from the 
        input data. Some of the earliest-used algorithms, such as decision trees, produced 
        systems of hard if-then rules similar to the systems of hand-written rules that were 
        then common. Increasingly, however, research has focused on statistical models, 
        which make soft, probabilistic decisions based on attaching real-valued weights 
        to each input feature. Such models have the advantage that they can express the 
        relative certainty of many different possible answers rather than only one, producing
        more reliable results when such a model is included as a component of a larger system.'''

# Tokenization

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# word tokenization

tokens=nltk.word_tokenize(text)
print(type(tokens), '\nNumber of Tokens:', len(tokens))

<class 'list'> 
Number of Tokens: 255


In [0]:
tokens[:10]

['In',
 'the',
 'early',
 'days',
 ',',
 'many',
 'language-processing',
 'systems',
 'were',
 'designed']

In [0]:
tokens[-10:]

['is', 'included', 'as', 'a', 'component', 'of', 'a', 'larger', 'system', '.']

In [0]:
# sentence tokenization

sentences=nltk.sent_tokenize(text)
print(type(sentences), '\nNumber of Sentences:', len(sentences))

<class 'list'> 
Number of Sentences: 8


In [0]:
sentences[:3]

['In the early days, many language-processing systems were designed by hand-\n        coding a set of rules,[9],[10] e.g.',
 'by writing grammars or devising heuristic \n        rules for stemming.',
 'However, this is rarely robust to natural language variation.']

# Stopwords

In [0]:
# stopwords removal

from nltk.corpus import stopwords
nltk.download('stopwords')
stopword = set(stopwords.words('english'))
stopword

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [0]:
no_stop=''
for i in range(len(tokens)):
  if tokens[i] not in stopword:
    no_stop+=tokens[i]+' '
  else:
    print(tokens[i])
no_stop=no_stop.strip()

the
were
by
a
of
by
or
for
this
is
to
the
in
the
and
has
on
for
to
such
through
the
of
of
a
is
a
of
with
or
of
have
been
to
as
a
of
that
are
from
the
of
the
such
as
of
to
the
of
that
were
then
has
on
which
on
to
each
have
the
that
they
can
the
of
than
only
more
when
such
a
is
as
a
of
a


In [0]:
print('Length of text with stopwords:', len(text))
print('Length of text with no stop_words:', len(no_stop))

Length of text with stopwords: 1603
Length of text with no stop_words: 1218


# Stemming

In [0]:
# stemming
# reduce to root word

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

stem_token=[]
for token in tokens:
    stem_token.append(stemmer.stem(token))

In [0]:
for i in range(len(tokens)):
  print(tokens[i],'-->',stem_token[i])

In --> In
the --> the
early --> earli
days --> day
, --> ,
many --> mani
language-processing --> language-process
systems --> system
were --> were
designed --> design
by --> by
hand- --> hand-
coding --> code
a --> a
set --> set
of --> of
rules --> rule
, --> ,
[ --> [
9 --> 9
] --> ]
, --> ,
[ --> [
10 --> 10
] --> ]
e.g --> e.g
. --> .
by --> by
writing --> write
grammars --> grammar
or --> or
devising --> devis
heuristic --> heurist
rules --> rule
for --> for
stemming --> stem
. --> .
However --> howev
, --> ,
this --> thi
is --> is
rarely --> rare
robust --> robust
to --> to
natural --> natur
language --> languag
variation --> variat
. --> .
Since --> sinc
the --> the
so-called --> so-cal
`` --> ``
statistical --> statist
revolution --> revolut
'' --> ''
[ --> [
11 --> 11
] --> ]
[ --> [
12 --> 12
] --> ]
in --> in
the --> the
late --> late
1980s --> 1980
and --> and
mid --> mid
1990s --> 1990
, --> ,
much --> much
natural --> natur
language --> languag
processing --> process
resea

# Lemmatization

In [0]:
# lemmatization
# converts to base word. 


from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

lemma_token=[]
for token in tokens:
    lemma_token.append(lemmatizer.lemmatize(token))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
for i in range(len(tokens)):
  print(tokens[i],'-->',lemma_token[i])

In --> In
the --> the
early --> early
days --> day
, --> ,
many --> many
language-processing --> language-processing
systems --> system
were --> were
designed --> designed
by --> by
hand- --> hand-
coding --> coding
a --> a
set --> set
of --> of
rules --> rule
, --> ,
[ --> [
9 --> 9
] --> ]
, --> ,
[ --> [
10 --> 10
] --> ]
e.g --> e.g
. --> .
by --> by
writing --> writing
grammars --> grammar
or --> or
devising --> devising
heuristic --> heuristic
rules --> rule
for --> for
stemming --> stemming
. --> .
However --> However
, --> ,
this --> this
is --> is
rarely --> rarely
robust --> robust
to --> to
natural --> natural
language --> language
variation --> variation
. --> .
Since --> Since
the --> the
so-called --> so-called
`` --> ``
statistical --> statistical
revolution --> revolution
'' --> ''
[ --> [
11 --> 11
] --> ]
[ --> [
12 --> 12
] --> ]
in --> in
the --> the
late --> late
1980s --> 1980s
and --> and
mid --> mid
1990s --> 1990s
, --> ,
much --> much
natural --> natural
langu

In [0]:
# lemmatization
# converts to base word. 
# with context


from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

lemma_token=[]
for token in tokens:
    lemma_token.append(lemmatizer.lemmatize(token,pos="v"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
for i in range(len(tokens)):
  print(tokens[i],'-->',lemma_token[i])

In --> In
the --> the
early --> early
days --> days
, --> ,
many --> many
language-processing --> language-processing
systems --> systems
were --> be
designed --> design
by --> by
hand- --> hand-
coding --> cod
a --> a
set --> set
of --> of
rules --> rule
, --> ,
[ --> [
9 --> 9
] --> ]
, --> ,
[ --> [
10 --> 10
] --> ]
e.g --> e.g
. --> .
by --> by
writing --> write
grammars --> grammars
or --> or
devising --> devise
heuristic --> heuristic
rules --> rule
for --> for
stemming --> stem
. --> .
However --> However
, --> ,
this --> this
is --> be
rarely --> rarely
robust --> robust
to --> to
natural --> natural
language --> language
variation --> variation
. --> .
Since --> Since
the --> the
so-called --> so-called
`` --> ``
statistical --> statistical
revolution --> revolution
'' --> ''
[ --> [
11 --> 11
] --> ]
[ --> [
12 --> 12
] --> ]
in --> in
the --> the
late --> late
1980s --> 1980s
and --> and
mid --> mid
1990s --> 1990s
, --> ,
much --> much
natural --> natural
language --> lang

# Frequency distribution

In [0]:
# frequency distribution of tokens

from nltk.probability import FreqDist
sorted(FreqDist(tokens).items(),key=lambda k:k[1], reverse=True)

[(',', 14),
 ('of', 11),
 ('the', 9),
 ('.', 8),
 ('a', 7),
 ('rules', 5),
 ('to', 5),
 ('[', 4),
 (']', 4),
 ('systems', 3),
 ('set', 3),
 ('is', 3),
 ('``', 3),
 ('statistical', 3),
 ("''", 3),
 ('on', 3),
 ('such', 3),
 ('algorithms', 3),
 ('as', 3),
 ('input', 3),
 ('that', 3),
 ('many', 2),
 ('were', 2),
 ('by', 2),
 ('or', 2),
 ('for', 2),
 ('natural', 2),
 ('language', 2),
 ('research', 2),
 ('has', 2),
 ('machine-learning', 2),
 ('large', 2),
 ('corpora', 2),
 ('(', 2),
 (')', 2),
 ('different', 2),
 ('have', 2),
 ('models', 2),
 ('In', 1),
 ('early', 1),
 ('days', 1),
 ('language-processing', 1),
 ('designed', 1),
 ('hand-', 1),
 ('coding', 1),
 ('9', 1),
 ('10', 1),
 ('e.g', 1),
 ('writing', 1),
 ('grammars', 1),
 ('devising', 1),
 ('heuristic', 1),
 ('stemming', 1),
 ('However', 1),
 ('this', 1),
 ('rarely', 1),
 ('robust', 1),
 ('variation', 1),
 ('Since', 1),
 ('so-called', 1),
 ('revolution', 1),
 ('11', 1),
 ('12', 1),
 ('in', 1),
 ('late', 1),
 ('1980s', 1),
 ('and', 1)