In [1]:
################################ Natural Language Processing (NLP) #############################
# NLP is about developing applications and services that are able to understand human languages. 
# Google voice search, Grammer correction, chatbots,  sentiment analysis re examples of fields needs NLP

In [2]:
############################## NLTK : NLP ToolKit in Python ###################################
# The most powerful library for NLP 
# First, install nltk and download all its packages and dataset using following commands 
# (p.s. you need an Internet connection, a small pop-up window appear --> click download and wait)
import nltk
# nltk.download()
# in case you don't wanna wait, just make sre to download the following packages
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [3]:
# test
from nltk.corpus import brown # a common dataset
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [4]:
######################### Tokenization ######################################
# the process by which big quantity of text is divided into smaller parts called tokens, such as words or sentences
# these tokens are used afterwards to detect and find specific patterns in text
# The required module is 'tokenize' (i.e. import nltk.tokenize)
# tokenize is further compromises of sub-modules: (1) word tokenize and (2) sentence tokenize

In [5]:
### (1) word tokenize
# method word_tokenize() splits a sentence into words
# example use: word frequency, converting txt into numerics 
from nltk.tokenize import word_tokenize

text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

print(word_tokenize(text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


In [6]:
# Frequency Distribution example
from nltk.probability import FreqDist
fdist = FreqDist(word_tokenize(text))
print(fdist);
fdist.most_common(2);

<FreqDist with 25 samples and 30 outcomes>


In [7]:
# Frequancy Distribution Plot
import matplotlib.pyplot as plt
fdist.plot(25) # 25 is the number of samples or words
plt.show()

<Figure size 640x480 with 1 Axes>

In [8]:
### (2) senetence tokenize
# method sent_tokenize() splits a text into sentences
# example use: counting frequency of words per senternce

from nltk.tokenize import sent_tokenize

text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

print(sent_tokenize(text))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]


In [9]:
############# Stop Words #######################
# It's a good practice to clean text from stop words
# If you'r not familiar with stop words, run the following code
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
print(stop_words)

{'such', "you've", 'me', "shan't", "mustn't", 'there', 'doing', 'for', 'now', 'while', 'over', 'to', 'between', 'and', 'hers', 'o', 'too', "aren't", 'above', 'a', 'where', 've', 'was', "don't", 'who', 'up', 'shouldn', 'isn', "should've", 'until', 'just', 'my', 'from', 'why', 'being', 'mightn', 'himself', 'during', 'then', 'into', 'that', 'all', 'below', 'further', "you'll", 'any', 'each', 'myself', 'because', 'were', 'didn', 'he', 'her', 'itself', "wouldn't", 'does', "won't", "isn't", 'under', 're', 'has', 'at', 'will', "you're", 'more', 'this', 'some', 'll', 'been', 'hasn', 'needn', 'their', 'only', 'did', 'the', 'doesn', 'you', 'by', 'wasn', 'off', 'ours', 'him', 'down', 'yours', 't', 'they', 'so', 'very', 'both', 'hadn', 'theirs', 'are', 'out', "that'll", 'am', 'same', 'have', 'do', 'it', 'about', 'couldn', "mightn't", 'but', 'with', 'no', 'its', "needn't", 'those', 'in', 'we', 'haven', 'your', 'an', "it's", 'how', 'don', 'aren', "haven't", 'again', 'when', 'through', 'should', 'his

In [10]:
# Now, yo can clear tokenized words from stop words to work with only intersting ones
t_words = word_tokenize(text)
filtered_words = []
for w in t_words:
    if w not in stop_words:
        filtered_words.append(w)
print("Tokenized Sentence: ", t_words)
print("Filtered Sentence: ", filtered_words)

Tokenized Sentence:  ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']
Filtered Sentence:  ['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'The', 'weather', 'great', ',', 'city', 'awesome', '.', 'The', 'sky', 'pinkish-blue', '.', 'You', "n't", 'eat', 'cardboard']


In [11]:
########################### POS Tagging (Part-Of-Speech) #####################################
# read the text in a language and assigning some specific token (Parts of Speech) to each word.
# Example::
# Input: Everything to permit us.
# Output: [('Everything', NN),('to', TO), ('permit', VB), ('us', PRP)]
# for a list of abbreviatio look at: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [12]:
# import pos_tag module
# get a set of word_tokenize, then apply pos_tag() method
from nltk import pos_tag

token_tags = pos_tag(filtered_words)
print(token_tags)

[('Hello', 'NNP'), ('Mr.', 'NNP'), ('Smith', 'NNP'), (',', ','), ('today', 'NN'), ('?', '.'), ('The', 'DT'), ('weather', 'NN'), ('great', 'JJ'), (',', ','), ('city', 'NN'), ('awesome', 'NN'), ('.', '.'), ('The', 'DT'), ('sky', 'JJ'), ('pinkish-blue', 'NN'), ('.', '.'), ('You', 'PRP'), ("n't", 'RB'), ('eat', 'VB'), ('cardboard', 'NN')]


In [13]:
############################## Chunking ######################################
# add more structure to the sentence by following parts of speech (POS) tagging
# For example, you need to tag Noun, verb (past tense), adjective, and coordinating junction from the sentence. 
from nltk import RegexpParser
pattern = """mychunk:{<NN.?>*<VBD.?>*<NNP.?>*<VB.?>*}"""
chunker = RegexpParser(pattern)
chunkes_words = chunker.parse(token_tags[:15])
print(chunkes_words)
chunkes_words.draw()

(S
  (mychunk Hello/NNP Mr./NNP Smith/NNP)
  ,/,
  (mychunk today/NN)
  ?/.
  The/DT
  (mychunk weather/NN)
  great/JJ
  ,/,
  (mychunk city/NN awesome/NN)
  ./.
  The/DT
  sky/JJ)


In [14]:
######################### Stemming ##############################
# reduces words to their word root word
# Example: the root word is "eat" and it's variations are "eats, eating, eaten and like so".
# "PorterStemmer": This algorithm accepts the list of tokenized word and stems it into root word.

from nltk.stem import PorterStemmer
words= ["wait", "waiting", "waited", "waits"]
ps =PorterStemmer()
for w in words:
    rootWord=ps.stem(w)
    print(rootWord)

wait
wait
wait
wait


In [15]:
# let's try on our t_words
for w in filtered_words:
    rootWord=ps.stem(w)
    print(rootWord)

hello
mr.
smith
,
today
?
the
weather
great
,
citi
awesom
.
the
sky
pinkish-blu
.
you
n't
eat
cardboard


In [16]:
############################### Lemmatization ###############################
# finding the lemma of a word depending on their meaning.
# lemmatization implies doing proper reduction to the dictionary
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()
for w in filtered_words:
    print(lem.lemmatize(w))

Hello
Mr.
Smith
,
today
?
The
weather
great
,
city
awesome
.
The
sky
pinkish-blue
.
You
n't
eat
cardboard


In [17]:
########### Stemming vs. Lemmatization ##################
# Stemming algorithm works by cutting the suffix from the word, cuts either the beginning or end of the word.
# Lemmatization takes into consideration morphological analysis of the words. 

text = "studies studying cries cry"
tokenization = word_tokenize(text)

for w in tokenization:
    print("Stemming for {} is {}".format(w,ps.stem(w)))  
    print("Lemma for {} is {}\n------------".format(w, lem.lemmatize(w)))


Stemming for studies is studi
Lemma for studies is study
------------
Stemming for studying is studi
Lemma for studying is studying
------------
Stemming for cries is cri
Lemma for cries is cry
------------
Stemming for cry is cri
Lemma for cry is cry
------------
