In [1]:
# Installation of nltk
#In Jupyter, the console commands can be executed by the ‘!’ sign before the command within the cell
!pip install nltk



### Text Preprocessing 
Following code can be used for text preprocessing useful for various NLP applications.

First we need to import nltk

For a given text, we can do sentence tokenization and word tokenization using nltk library functions.
We can remove the punctuations using string library.

We can then remove stop words in English to get the important words in the text.

We also perform stemming and lemmatization. Stemming and Lemmatization are two different techniques that help reduce our data space. We don’t need to check every single form of a word for reducing the size of the big data corpus.

In [2]:
#import nltk library for using its different functions
import nltk
import string
import re


In [3]:
#  Sentence Tokenization  - Tokenizes sentences from text
from nltk.tokenize import sent_tokenize

In [4]:
# Word Tokenization  - Tokenizes words in sentences
from nltk.tokenize import word_tokenize

In [5]:
statement = "Hello all, My name is sariya. Welcome to the basic text processing of Natural Language Processing(NLP). NLP is a very interesting area of data science."

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
sentences = sent_tokenize(statement)
print(sentences)
words = word_tokenize(statement)
print(words)

['Hello all, My name is sariya.', 'Welcome to the basic text processing of Natural Language Processing(NLP).', 'NLP is a very interesting area of data science.']
['Hello', 'all', ',', 'My', 'name', 'is', 'sariya', '.', 'Welcome', 'to', 'the', 'basic', 'text', 'processing', 'of', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', '.', 'NLP', 'is', 'a', 'very', 'interesting', 'area', 'of', 'data', 'science', '.']


In [8]:
for sentence in sentences:
    print(sentence)

Hello all, My name is sariya.
Welcome to the basic text processing of Natural Language Processing(NLP).
NLP is a very interesting area of data science.


In [9]:
for word in words:
    print(word)

Hello
all
,
My
name
is
sariya
.
Welcome
to
the
basic
text
processing
of
Natural
Language
Processing
(
NLP
)
.
NLP
is
a
very
interesting
area
of
data
science
.


In [10]:
 # Remove punctuations
for word in words:
    if word not in string.punctuation:
        print(word)

Hello
all
My
name
is
sariya
Welcome
to
the
basic
text
processing
of
Natural
Language
Processing
NLP
NLP
is
a
very
interesting
area
of
data
science


In [11]:
only_words=[w for w in words if not w in string.punctuation]
print(only_words)

['Hello', 'all', 'My', 'name', 'is', 'sariya', 'Welcome', 'to', 'the', 'basic', 'text', 'processing', 'of', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'is', 'a', 'very', 'interesting', 'area', 'of', 'data', 'science']


In [12]:
#Removal of stop words from the text
from nltk.corpus import stopwords

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# List of English stop words 
english_stop_words=set(stopwords.words("english"))
print(english_stop_words)

{'nor', 'shan', 'its', "couldn't", 'from', 'all', 'shouldn', 'having', 'the', 'won', 'is', 'yours', "you'd", 'if', 'wouldn', 'no', 'you', 'while', 'against', 'isn', 'above', 'will', 'not', 'before', 'hers', 'ain', 'only', 're', 'their', 'mightn', 'myself', 'does', 'me', 'y', 'or', 'then', 'as', 'should', 'an', 'do', "you'll", 'below', 'very', 'into', 'ours', "hadn't", 'he', 'have', 'i', 'at', 'can', 'here', 'most', 'wasn', 'who', "should've", 'had', 'to', 'off', "doesn't", "aren't", 'which', "you've", 'few', 'during', 'm', 'each', 'our', "needn't", 'there', 'haven', 'mustn', 'am', 'on', 'with', 'than', 'until', "don't", "wasn't", 't', "hasn't", 'after', 'some', 'ma', 'his', 's', 'now', "mightn't", 'other', 'but', 'doing', 'out', 'd', 'a', 'we', 'how', "won't", 'be', 'are', 'why', 'those', "it's", 'herself', 'themselves', "shouldn't", 'just', 'whom', 'couldn', "didn't", 'itself', 'own', 'yourselves', 'that', 'again', 'needn', "wouldn't", 'this', 'between', 'in', 'too', 'don', 'about', '

In [15]:
# Removal of stop words from the text
keywords=[w for w in only_words if not w in english_stop_words]
print(keywords)

['Hello', 'My', 'name', 'sariya', 'Welcome', 'basic', 'text', 'processing', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'interesting', 'area', 'data', 'science']


### Lemmatization

Lemmatization in NLP is the process through which several different forms of the same word are mapped to one single form, which we can call the root form or the base form. In more technical terms, the root form is called a lemma. By reducing the number of forms a word can take, we make sure that we reduce our data space and that we don’t have to check every single form of a word. It helps us ignore morphological variations on a single word. Lemmatization brings context to the words.So it goes a steps further by linking words with similar meaning to one word. For example if a paragraph has words like cars, trains and automobile, then it will link all of them to automobile. In the below program we use the WordNet lexical database for lemmatization.

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
keywords=[w for w in only_words if w in wordnet_lemmatizer.lemmatize(w)]
print(keywords)
# Next find the roots of the word
# for w in keywords:
#     lemmatized_words=wordnet_lemmatizer.lemmatize(w)
#     print(wordnet_lemmatizer.lemmatize(w))

['Hello', 'all', 'My', 'name', 'is', 'sariya', 'Welcome', 'to', 'the', 'basic', 'text', 'processing', 'of', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'is', 'a', 'very', 'interesting', 'area', 'of', 'data', 'science']


### Stemming

Stemming in NLP is the process of removing prefixes and suffixes from words so that they are reduced to simpler forms which are called stems. The purpose of stemming is to reduce our vocabulary and dimensionality for NLP tasks and to improve speed and efficiency in information retrieval and information processing tasks. Stemming is a simpler, faster process than lemmatization. The difference is that stemming is usually only rule-based approach. And, as we've showed with our earlier example, rule-based approaches can fail very quickly on more complex examples. But for most problems, it works well enough. Many search engines use stemming to improve their search results.


In [18]:
# Stemming
from nltk.stem import PorterStemmer

In [19]:
porter_stemmer = PorterStemmer()
# First Word tokenization
nltk_tokens = nltk.word_tokenize(statement)
#Next find the roots of the word
for w in keywords:
       print(porter_stemmer.stem(w))

hello
all
My
name
is
sariya
welcom
to
the
basic
text
process
of
natur
languag
process
nlp
nlp
is
a
veri
interest
area
of
data
scienc


In [20]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [21]:
# POS Tagging

In [22]:
print(nltk.pos_tag(keywords))

[('Hello', 'NNP'), ('all', 'DT'), ('My', 'NNP'), ('name', 'NN'), ('is', 'VBZ'), ('sariya', 'JJ'), ('Welcome', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('basic', 'JJ'), ('text', 'NN'), ('processing', 'NN'), ('of', 'IN'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('NLP', 'NNP'), ('NLP', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ('interesting', 'JJ'), ('area', 'NN'), ('of', 'IN'), ('data', 'NNS'), ('science', 'NN')]
