In [4]:
# Word and Sentence Tokenization

In [5]:
## Sentence tokenization

In [2]:
#Word and sentence tokenization library 
from nltk.tokenize import sent_tokenize, word_tokenize

#Text for tokenization
text = """Hello Mr. Smith, how are you doing today?
                The weather is great and Python is awesome.
                The sky is pinkish-blue. You should not ept cardboard."""

#Print the Sentence level toknization
print(sent_tokenize(text))

['Hello Mr. Smith, how are you doing today?', 'The weather is great and Python is awesome.', 'The sky is pinkish-blue.', 'You should not ept cardboard.']


In [3]:
#To print each sentence in a new line
for sent in sent_tokenize(text):
    print(sent)

Hello Mr. Smith, how are you doing today?
The weather is great and Python is awesome.
The sky is pinkish-blue.
You should not ept cardboard.


In [6]:
## Word Tokenization

In [7]:
#Print the Word level toknization
print(word_tokenize(text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', 'not', 'ept', 'cardboard', '.']


In [8]:
#To print each Word in a new line
for word in word_tokenize(text):
    print(word)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
and
Python
is
awesome
.
The
sky
is
pinkish-blue
.
You
should
not
ept
cardboard
.


In [9]:
## Stop words removal

In [10]:
from nltk.corpus import stopwords #Stop word library 
from nltk.tokenize import word_tokenize #Word tokenization library 

#Text for tokenization
text = """This is an example showing off stop word filteration."""

#English languge stop words  
stop_words = set(stopwords.words("english"))

print(stop_words)

{'too', 'them', 'or', 'hadn', 'now', 'had', 'yours', 'herself', "should've", 'i', 'her', 'between', "you're", 'doesn', 'd', 'because', 'were', 'nor', 'there', 've', 'against', "isn't", 'me', "that'll", 'which', 'these', 'has', "hadn't", 'can', 'm', 'aren', 'below', 'that', 'y', 'yourself', 'doing', 'each', 'weren', 'if', 'himself', 'by', "needn't", "don't", 'you', 'up', 'myself', 'yourselves', "it's", 'of', 'few', 'our', 'after', 'does', 'needn', 'ourselves', 'same', 'did', 'couldn', 'why', 'some', 'she', 'he', "you'd", 'while', "doesn't", 'for', 'both', 'hasn', 'as', 'before', 'hers', 'those', 'it', 'ain', 'how', "haven't", 'what', 'through', 'who', 'having', 'no', "won't", 'a', 'than', 'didn', 'very', 't', 'any', 'into', 'was', 'such', 'have', 'their', "wouldn't", "mightn't", 'isn', 'his', "weren't", 'here', 'more', 'this', 're', 'they', 'then', "shouldn't", 'your', 'itself', 'is', 'are', 'again', 's', "couldn't", 'about', 'will', 'and', 'over', 'so', 'do', "aren't", 'wasn', "you'll"

In [11]:
len(stop_words)

179

In [13]:
#Removing stop words from the given text 
words = word_tokenize(text)

words_without_stopword = []

for word in words:
    if word not in stop_words:
        words_without_stopword.append(word)
print("List of words before removal of stop words:\n ", words) 
print("=====================================================================")
print("List of words after removal of stop words: \n ", words_without_stopword)

List of words before removal of stop words:
  ['This', 'is', 'an', 'example', 'showing', 'off', 'stop', 'word', 'filteration', '.']
List of words after removal of stop words: 
  ['This', 'example', 'showing', 'stop', 'word', 'filteration', '.']


In [14]:
print("Number of words before stop word removal is: ", len(words))
print("Number of words after stop word removal is: ", len(words_without_stopword))

Number of words before stop word removal is:  10
Number of words after stop word removal is:  7


In [15]:
#Stop words removal in different way
sentence_without_stopword = [word for word in words if not word in stop_words]
print(sentence_without_stopword)

['This', 'example', 'showing', 'stop', 'word', 'filteration', '.']


In [16]:
print("Number of words after stop word removal is: ", len(sentence_without_stopword))

Number of words after stop word removal is:  7


In [17]:
## Using Stemmer to stem words

In [18]:
from nltk.stem import PorterStemmer #Porter Stemmer  
from nltk.tokenize import word_tokenize #Word tokenization library 

ps = PorterStemmer() #Porter Stemmer object  

#List of words to stem
words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

for word in words:
    print(ps.stem(word))#applying the stem object on the list of words 

python
python
python
python
pythonli


In [19]:
text = """It is very important to be pythonly while you are pythoning with python.
           All pythoners have pythoned poorly at least onec."""

words = word_tokenize(text)

for word in words:
    print(ps.stem(word))#applying the stem object on the text

it
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onec
.


In [20]:
## Part of Speech Tagging

In [21]:
import nltk
from nltk.corpus import state_union

from nltk.tokenize import PunktSentenceTokenizer #Special type of tokenizer 

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

#To print GWBush 2005 speech
print(train_text[:500])

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territo


In [22]:
#To print GWBush 2006 speech
print(sample_text[:500])

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
January 31, 2006

THE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the hus


In [23]:
train_text_short = train_text[:500]#Short form of the train text
print(train_text_short)

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territo


In [24]:
sample_text_short = sample_text[:500]#Short form of the sample text
print(sample_text_short)

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
January 31, 2006

THE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the hus


In [25]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text_short)
tokenized = custom_sent_tokenizer.tokenize(sample_text_short)

#Part of speech tagging
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
        
    except Exception as e:
        print(str(e))
        
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [26]:
## Part of Speech Tagging using Chunk

In [27]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

train_text_short = train_text[:500]
sample_text_short = sample_text[:500]

custom_sent_tokenizer = PunktSentenceTokenizer(train_text_short)
tokenized = custom_sent_tokenizer.tokenize(sample_text_short)

#Part of speech tagging using chunk
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
        
    except Exception as e:
        print(str(e))
        
process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  (Chunk called/VBD America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

train_text_short = train_text[:500]
sample_text_short = sample_text[:500]

custom_sent_tokenizer = PunktSentenceTokenizer(train_text_short)
tokenized = custom_sent_tokenizer.tokenize(sample_text_short)

#Part of speech tagging using chunk
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            chunked.draw()
        
    except Exception as e:
        print(str(e))
        
process_content()

In [None]:
## Part of Speech Tagging using Chinking

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

train_text_short = train_text[:500]
sample_text_short = sample_text[:500]

custom_sent_tokenizer = PunktSentenceTokenizer(train_text_short)
tokenized = custom_sent_tokenizer.tokenize(sample_text_short)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}
                        }<VB.?|IN|DT|TO>{
                        """
                        
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            chunked.draw
        
    except Exception as e:
        print(str(e))
        
process_content()

In [None]:
## Named Entity Recognition

In [None]:
###Name Entity
####Name Entity Examples:

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

train_text_short = train_text[:500]
sample_text_short = sample_text[:500]

custom_sent_tokenizer = PunktSentenceTokenizer(train_text_short)
tokenized = custom_sent_tokenizer.tokenize(sample_text_short)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            nameEnt = nltk.ne_chunk(tagged, binary="True")
            
            nameEnt.draw()
        
    except Exception as e:
        print(str(e))
        
process_content()

In [None]:
##Lemmatization

In [1]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))

cat


In [2]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))


print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", 'v'))

better
good
best
run
run


In [1]:
#Stemming Vs Lemmatizing 
#Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

words = ["history", "historical", "finally", "final", "finalized"]

for word in words:
    print(ps.stem(word))

histori
histor
final
final
final


In [2]:
#Stemming Vs Lemmatizing
#Lemmatizating
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = ["history", "historical", "finally", "final", "finalized"]

for word in words:
    print(lemmatizer.lemmatize(word))

history
historical
finally
final
finalized


In [3]:
#NLTK Corpora

In [4]:
import nltk

print(nltk.__file__)

C:\Users\ShadowX9\anaconda3\lib\site-packages\nltk\__init__.py


In [5]:
#To go to the corpora location
%appdata%
#Go to "nltk_data"

UsageError: Line magic function `%appdata%` not found.


In [6]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample)
print(tok[5:10])

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.']


In [7]:
##WordNet

In [8]:
from nltk.corpus import wordnet

syns = wordnet.synsets("program")

print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [9]:
print(syns[0])

Synset('plan.n.01')


In [10]:
print(syns[0].lemmas()[0])

Lemma('plan.n.01.plan')


In [11]:
print(syns[0].lemmas()[0].name())

plan


In [12]:
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [13]:
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [14]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
            
print("List of Synonyms:\n", set(synonyms))
print("List of Antonyms:\n", set(antonyms))

List of Synonyms:
 {'right', 'adept', 'goodness', 'just', 'estimable', 'ripe', 'in_force', 'soundly', 'sound', 'unspoiled', 'full', 'effective', 'well', 'expert', 'dear', 'unspoilt', 'thoroughly', 'upright', 'beneficial', 'dependable', 'serious', 'trade_good', 'respectable', 'good', 'secure', 'undecomposed', 'practiced', 'commodity', 'skilful', 'honest', 'in_effect', 'near', 'proficient', 'salutary', 'honorable', 'safe', 'skillful'}
List of Antonyms:
 {'ill', 'bad', 'badness', 'evilness', 'evil'}


In [15]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))

0.9090909090909091


In [16]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")
print(w1.wup_similarity(w2))

0.6956521739130435


In [17]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2))

0.32


In [18]:
w1 = wordnet.synset("kitten.n.01")
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2))

0.5833333333333334
