In [10]:
# Installation of nltk
 #In Jupyter, the console commands can be executed by the ‘!’ sign before the command within the cell
!pip install nltk



### Text Preprocessing 
Following code can be used for text preprocessing useful for various NLP applications.

First we need to import nltk

For a given text, we can do sentence tokenization and word tokenization using nltk library functions.
We can remove the punctuations using string library.

We can then remove stop words in English to get the important words in the text.

We also perform stemming and lemmatization. Stemming and Lemmatization are two different techniques that help reduce our data space. We don’t need to check every single form of a word for reducing the size of the big data corpus.

In [11]:
#import nltk library for using its different functions
import nltk
import string
import re

In [12]:
#  Sentence Tokenization  - Tokenizes sentences from text
from nltk.tokenize import sent_tokenize

In [13]:
# Word Tokenization  - Tokenizes words in sentences
from nltk.tokenize import word_tokenize

In [17]:
statement = "Hello all, I am Dr. Chetana. Welcome to the lab session of Natural Language Processing(NLP). NLP is a very interesting area."

In [18]:
sentences = sent_tokenize(statement)
print(sentences)
words = word_tokenize(statement)
print(words)

['Hello all, I am Dr. Chetana.', 'Welcome to the lab session of Natural Language Processing(NLP).', 'NLP is a very interesting area.']
['Hello', 'all', ',', 'I', 'am', 'Dr.', 'Chetana', '.', 'Welcome', 'to', 'the', 'lab', 'session', 'of', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', '.', 'NLP', 'is', 'a', 'very', 'interesting', 'area', '.']


In [19]:
for sentence in sentences:
    print(sentence)

Hello all, I am Dr. Chetana.
Welcome to the lab session of Natural Language Processing(NLP).
NLP is a very interesting area.


In [20]:
for word in words:
    print(word)

Hello
all
,
I
am
Dr.
Chetana
.
Welcome
to
the
lab
session
of
Natural
Language
Processing
(
NLP
)
.
NLP
is
a
very
interesting
area
.


In [21]:
 # Remove punctuations
for word in words:
    if word not in string.punctuation:
        print(word)

Hello
all
I
am
Dr.
Chetana
Welcome
to
the
lab
session
of
Natural
Language
Processing
NLP
NLP
is
a
very
interesting
area


In [22]:
only_words=[w for w in words if not w in string.punctuation]
print(only_words)

['Hello', 'all', 'I', 'am', 'Dr.', 'Chetana', 'Welcome', 'to', 'the', 'lab', 'session', 'of', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'is', 'a', 'very', 'interesting', 'area']


In [23]:
#Removal of stop words from the text
from nltk.corpus import stopwords

In [24]:
# List of English stop words 
english_stop_words=set(stopwords.words("english"))
print(english_stop_words)

{'my', 'd', 've', 'they', 'under', 'did', 'theirs', 'can', "that'll", "needn't", 'ourselves', 'into', 'because', 'nor', 'own', 'too', 'haven', 'only', 'other', "you'd", "it's", 'while', 'or', 'she', 'to', 'its', 'why', 'those', 'aren', 'on', 'isn', 'mightn', 'hasn', 'is', 'out', 'o', 'over', 'once', "you've", 'he', 'him', 'further', 'such', 'ma', 'by', 't', 'ours', 'has', 'who', 'their', 'then', "mustn't", 'yourself', 'me', 'being', 'all', 'between', 'you', 'have', "won't", "she's", 'when', 'hers', 'couldn', "isn't", "hadn't", 'we', 'at', 'his', 'mustn', "shan't", 'up', "wasn't", 'shouldn', 'this', 'will', 'about', 'now', 'in', 'weren', 'than', 'do', 'it', "should've", 'them', "you're", 'yourselves', 'before', "you'll", 'yours', 'himself', 'any', "weren't", 'below', 'a', 'be', 'with', "hasn't", 'but', 'll', 'wouldn', 'y', 'ain', 'doing', 'as', 'few', 'was', 'which', 'your', 'our', 'until', 'these', 'hadn', 'from', 're', 'are', 'same', 'wasn', 's', 'and', 'if', 'her', "shouldn't", 'both

In [25]:
# Removal of stop words from the text
keywords=[w for w in only_words if not w in english_stop_words]
print(keywords)

['Hello', 'I', 'Dr.', 'Chetana', 'Welcome', 'lab', 'session', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'interesting', 'area']


### Lemmatization

Lemmatization in NLP is the process through which several different forms of the same word are mapped to one single form, which we can call the root form or the base form. In more technical terms, the root form is called a lemma. By reducing the number of forms a word can take, we make sure that we reduce our data space and that we don’t have to check every single form of a word. It helps us ignore morphological variations on a single word. Lemmatization brings context to the words.So it goes a steps further by linking words with similar meaning to one word. For example if a paragraph has words like cars, trains and automobile, then it will link all of them to automobile. In the below program we use the WordNet lexical database for lemmatization.

In [36]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
keywords=[w for w in only_words if w in wordnet_lemmatizer.lemmatize(w)]
print(keywords)
#Next find the roots of the word
##for w in keywords:
    #lemmatized_words=wordnet_lemmatizer.lemmatize(w)
    #print(wordnet_lemmatizer.lemmatize(w))

['Hello', 'all', 'I', 'am', 'Dr.', 'Chetana', 'Welcome', 'to', 'the', 'lab', 'session', 'of', 'Natural', 'Language', 'Processing', 'NLP', 'NLP', 'is', 'a', 'very', 'interesting', 'area']


### Stemming

Stemming in NLP is the process of removing prefixes and suffixes from words so that they are reduced to simpler forms which are called stems. The purpose of stemming is to reduce our vocabulary and dimensionality for NLP tasks and to improve speed and efficiency in information retrieval and information processing tasks. Stemming is a simpler, faster process than lemmatization. The difference is that stemming is usually only rule-based approach. And, as we've showed with our earlier example, rule-based approaches can fail very quickly on more complex examples. But for most problems, it works well enough. Many search engines use stemming to improve their search results.


In [27]:
# Stemming
from nltk.stem import PorterStemmer

In [28]:
porter_stemmer = PorterStemmer()
# First Word tokenization
nltk_tokens = nltk.word_tokenize(statement)
#Next find the roots of the word
for w in keywords:
       print(porter_stemmer.stem(w))

hello
I
dr.
chetana
welcom
lab
session
natur
languag
process
nlp
nlp
interest
area


In [29]:
# POS Tagging

In [38]:
print(nltk.pos_tag(keywords))

[('Hello', 'NNP'), ('all', 'DT'), ('I', 'PRP'), ('am', 'VBP'), ('Dr.', 'NNP'), ('Chetana', 'NNP'), ('Welcome', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('lab', 'NN'), ('session', 'NN'), ('of', 'IN'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('NLP', 'NNP'), ('NLP', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ('interesting', 'JJ'), ('area', 'NN')]
