# NLTK Introduction

# About NLTK library

- NLTK is a leading platform for building Python programs to work with human language data
- https://www.nltk.org/index.html

# NLTK library installation

In [1]:
# Optional one-time setup: uncomment the next line to install NLTK.
# !pip install nltk

# Some vocabulary for NLP

In [1]:
import nltk as nlp
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,SnowballStemmer,RegexpStemmer
import matplotlib.pyplot as plt
import re

# NLTK version

In [2]:
# Show the installed NLTK version (`nlp` is the alias given to `nltk` at import time).
nlp.__version__

'3.6.5'

# How to download nltk corpus data

In [5]:
# Optional one-time setup: uncomment the next line to download NLTK corpus data.
# nlp.download()

# Sample Text

In [3]:
# Sample paragraph used as input text. It still contains bracketed citation
# markers (e.g. [6], [7]) and phonetic notation, and ends with a newline.
txt = """Albert Einstein (/ˈaɪnstaɪn/ EYEN-styne;[6] German: [ˈalbɛʁt ˈʔaɪnʃtaɪn] (audio speaker iconlisten); 14 March 1879 – 18 April 1955) was a German-born theoretical physicist,[7] widely acknowledged to be one of the greatest physicists of all time. Einstein is best known for developing the theory of relativity, but he also made important contributions to the development of the theory of quantum mechanics. Relativity and quantum mechanics are together the two pillars of modern physics.[3][8] His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been dubbed "the world's most famous equation".[9] His work is also known for its influence on the philosophy of science.[10][11] He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect",[12] a pivotal step in the development of quantum theory. His intellectual achievements and originality resulted in "Einstein" becoming synonymous with "genius".[13]
"""

In [4]:
# Display the sample text exactly as stored in `txt`.
print(txt)

Albert Einstein (/ˈaɪnstaɪn/ EYEN-styne;[6] German: [ˈalbɛʁt ˈʔaɪnʃtaɪn] (audio speaker iconlisten); 14 March 1879 – 18 April 1955) was a German-born theoretical physicist,[7] widely acknowledged to be one of the greatest physicists of all time. Einstein is best known for developing the theory of relativity, but he also made important contributions to the development of the theory of quantum mechanics. Relativity and quantum mechanics are together the two pillars of modern physics.[3][8] His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been dubbed "the world's most famous equation".[9] His work is also known for its influence on the philosophy of science.[10][11] He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect",[12] a pivotal step in the development of quantum theory. His intellectual achievements and originality resulted in "Einstein" becom

# Part-5

# NLTK Corpora
NLTK ships with a large collection of text corpora (books, speeches, ratings/reviews, etc.) that can be loaded from `nltk.corpus`.

In [6]:
from nltk.corpus import gutenberg

###### Tokenize into sentences

In [10]:
# Load the raw text of the King James Bible from the Gutenberg corpus.
t1 = gutenberg.raw("bible-kjv.txt")
# Split the raw text into a list of sentence strings.
t2 = sent_tokenize(t1)
# Show the first three sentences as a list (newlines appear escaped as \n).
print(t2[:3])

['[The King James Bible]\n\nThe Old Testament of the King James Bible\n\nThe First Book of Moses:  Called Genesis\n\n\n1:1 In the beginning God created the heaven and the earth.', '1:2 And the earth was without form, and void; and darkness was upon\nthe face of the deep.', 'And the Spirit of God moved upon the face of the\nwaters.']


###### Print the first 3 sentences

In [11]:
# Print each of the first three tokenized sentences on its own line,
# so embedded newlines are rendered instead of shown as \n.
first_three = t2[:3]
for sentence in first_three:
    print(sentence)

[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.
And the Spirit of God moved upon the face of the
waters.


# WordNet
WordNet lets you look up the synonyms, antonyms, definitions, and usage context of a word.

Ex: `survey.n.01`

- Word — the word you are searching for.
- Part of Speech (POS) — the grammatical category (noun, verb, adjective, adverb, pronoun, preposition, conjunction, interjection, numeral, article, or determiner) that a word belongs to, determined by both its definition and its context.
- NN — a sense number. A word can have multiple meanings or definitions; for example, “survey.n.01” is the first noun sense of the word “survey”.



In [12]:
from nltk.corpus import wordnet

###### List all the possible meanings of a given word, like "good"

In [13]:
# Collect every WordNet synset (sense) recorded for the word "good".
syn = wordnet.synsets("good")
# Bare expression: display the list of synsets as the cell output.
syn

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

###### Get the lemmas of each meaning

In [14]:
# For each synset of "good", print the full list of its lemmas.
for synset in syn:
    lemma_list = synset.lemmas()
    print(lemma_list)

[Lemma('good.n.01.good')]
[Lemma('good.n.02.good'), Lemma('good.n.02.goodness')]
[Lemma('good.n.03.good'), Lemma('good.n.03.goodness')]
[Lemma('commodity.n.01.commodity'), Lemma('commodity.n.01.trade_good'), Lemma('commodity.n.01.good')]
[Lemma('good.a.01.good')]
[Lemma('full.s.06.full'), Lemma('full.s.06.good')]
[Lemma('good.a.03.good')]
[Lemma('estimable.s.02.estimable'), Lemma('estimable.s.02.good'), Lemma('estimable.s.02.honorable'), Lemma('estimable.s.02.respectable')]
[Lemma('beneficial.s.01.beneficial'), Lemma('beneficial.s.01.good')]
[Lemma('good.s.06.good')]
[Lemma('good.s.07.good'), Lemma('good.s.07.just'), Lemma('good.s.07.upright')]
[Lemma('adept.s.01.adept'), Lemma('adept.s.01.expert'), Lemma('adept.s.01.good'), Lemma('adept.s.01.practiced'), Lemma('adept.s.01.proficient'), Lemma('adept.s.01.skillful'), Lemma('adept.s.01.skilful')]
[Lemma('good.s.09.good')]
[Lemma('dear.s.02.dear'), Lemma('dear.s.02.good'), Lemma('dear.s.02.near')]
[Lemma('dependable.s.04.dependable'), Lem

In [18]:
# For each synset of "good", print only its first lemma object.
for synset in syn:
    first_lemma = synset.lemmas()[0]
    print(first_lemma)

Lemma('good.n.01.good')
Lemma('good.n.02.good')
Lemma('good.n.03.good')
Lemma('commodity.n.01.commodity')
Lemma('good.a.01.good')
Lemma('full.s.06.full')
Lemma('good.a.03.good')
Lemma('estimable.s.02.estimable')
Lemma('beneficial.s.01.beneficial')
Lemma('good.s.06.good')
Lemma('good.s.07.good')
Lemma('adept.s.01.adept')
Lemma('good.s.09.good')
Lemma('dear.s.02.dear')
Lemma('dependable.s.04.dependable')
Lemma('good.s.12.good')
Lemma('good.s.13.good')
Lemma('effective.s.04.effective')
Lemma('good.s.15.good')
Lemma('good.s.16.good')
Lemma('good.s.17.good')
Lemma('good.s.18.good')
Lemma('good.s.19.good')
Lemma('good.s.20.good')
Lemma('good.s.21.good')
Lemma('well.r.01.well')
Lemma('thoroughly.r.02.thoroughly')


In [19]:
# For each synset of "good", print the plain-text name of its first lemma.
for synset in syn:
    first_lemma = synset.lemmas()[0]
    print(first_lemma.name())

good
good
good
commodity
good
full
good
estimable
beneficial
good
good
adept
good
dear
dependable
good
good
effective
good
good
good
good
good
good
good
well
thoroughly


###### To get the meaning (definition) of a synset, here the first one

In [21]:
# Definition text of the first synset in `syn`.
syn[0].definition()

'benefit'

###### To get the usage examples for each meaning

In [29]:
# For each synset of "good", print its list of example phrases
# (an empty list is printed when a synset has no examples).
for synset in syn:
    example_phrases = synset.examples()
    print(example_phrases)

['for your own good', "what's the good of worrying?"]
['there is much good to be found in people']
['weigh the good against the bad', 'among the highest goods of all are happiness and self-realization']
[]
['good news from the hospital', 'a good report card', 'when she was good she was very very good', 'a good knife is one good for cutting', 'this stump will make a good picnic table', 'a good check', 'a good joke', 'a good exterior paint', 'a good secretary', 'a good dress for the office']
['gives full measure', 'gives good measure', 'a good mile from here']
[]
['all respectable companies give guarantees', "ruined the family's good name"]
['an arms limitation agreement beneficial to all countries', 'the beneficial effects of a temperate climate', 'the experience was good for her']
['we all had a good time', 'good manners']
['a genuinely good person', 'a just cause', 'an upright and respectable man']
['adept in handicrafts', 'an adept juggler', 'an expert job', 'a good mechanic', 'a pra

# Filter Synonyms and Antonyms from Lemmas

In [35]:
# Walk every lemma of every "good" synset and print the antonym list
# of each lemma that has at least one antonym.
for synset in syn:
    for lemma in synset.lemmas():
        opposites = lemma.antonyms()
        if opposites:
            print(opposites)

[Lemma('evil.n.03.evil')]
[Lemma('evil.n.03.evilness')]
[Lemma('bad.n.01.bad')]
[Lemma('bad.n.01.badness')]
[Lemma('bad.a.01.bad')]
[Lemma('evil.a.01.evil')]
[Lemma('ill.r.01.ill')]


In [39]:
# Walk every lemma of every "good" synset and, for each lemma that has
# antonyms, print the name of its first antonym.
for synset in syn:
    for lemma in synset.lemmas():
        opposites = lemma.antonyms()
        if opposites:
            print(opposites[0].name())

evil
evilness
bad
badness
bad
evil
ill


In [42]:
# Walk every lemma of every "good" synset and, for each lemma that has
# antonyms, print the lemma name alongside the name of its first antonym.
for synset in syn:
    for lemma in synset.lemmas():
        opposites = lemma.antonyms()
        if not opposites:
            continue
        print("Antonym for {} is --> {}".format(lemma.name(), opposites[0].name()))

Antonym for good is --> evil
Antonym for goodness is --> evilness
Antonym for good is --> bad
Antonym for goodness is --> badness
Antonym for good is --> bad
Antonym for good is --> evil
Antonym for well is --> ill


# Finding word similarity using wordnet

In [43]:
# Look up the first noun senses of "dog" and "cat" for comparison.
w1, w2 = wordnet.synset("dog.n.01"), wordnet.synset("cat.n.01")

In [44]:
# Show both synsets, one per line.
print(w1, w2, sep="\n")

Synset('dog.n.01')
Synset('cat.n.01')


In [45]:
# Wu-Palmer similarity score between the two synsets currently held in w1 and w2.
w1.wup_similarity(w2)

0.8571428571428571

In [48]:
# Same score expressed as a percentage, rounded to two decimal places.
round(w1.wup_similarity(w2)*100,2)

85.71

In [49]:
# Rebind w1/w2 to the first noun senses of "horse" and "camel".
w1, w2 = wordnet.synset("horse.n.01"), wordnet.synset("camel.n.01")

In [50]:
# Show both synsets, one per line.
print(w1, w2, sep="\n")

Synset('horse.n.01')
Synset('camel.n.01')


In [51]:
# Wu-Palmer similarity score between the two synsets currently held in w1 and w2.
w1.wup_similarity(w2)

0.8275862068965517

In [52]:
# Same score expressed as a percentage, rounded to two decimal places.
round(w1.wup_similarity(w2)*100,2)

82.76

In [53]:
# Rebind w1/w2 to the first noun senses of "eagle" and "rat".
w1, w2 = wordnet.synset("eagle.n.01"), wordnet.synset("rat.n.01")

In [54]:
# Show both synsets, one per line.
print(w1, w2, sep="\n")

Synset('eagle.n.01')
Synset('rat.n.01')


In [55]:
# Wu-Palmer similarity score between the two synsets currently held in w1 and w2.
w1.wup_similarity(w2)

0.72

In [56]:
# Same score expressed as a percentage, rounded to two decimal places.
round(w1.wup_similarity(w2)*100,2)

72.0

In [57]:
# Rebind w1/w2 to the first noun senses of "elephant" and "ant".
w1, w2 = wordnet.synset("elephant.n.01"), wordnet.synset("ant.n.01")

In [58]:
# Show both synsets, one per line.
print(w1, w2, sep="\n")

Synset('elephant.n.01')
Synset('ant.n.01')


In [59]:
# Wu-Palmer similarity score between the two synsets currently held in w1 and w2.
w1.wup_similarity(w2)

0.56

In [60]:
# Same score expressed as a percentage, rounded to two decimal places.
round(w1.wup_similarity(w2)*100,2)

56.0

In [61]:
# Rebind w1/w2 to the first adjective senses of "good" and "bad".
w1, w2 = wordnet.synset("good.a.01"), wordnet.synset("bad.a.01")

In [62]:
# Show both synsets, one per line.
print(w1, w2, sep="\n")

Synset('good.a.01')
Synset('bad.a.01')


In [63]:
# Wu-Palmer similarity score between the two synsets currently held in w1 and w2.
# NOTE(review): these are adjective synsets; Wu-Palmer scoring is based on
# depth in a hypernym taxonomy, which adjectives may not have, so this value
# may not be a meaningful similarity -- confirm against the NLTK docs.
w1.wup_similarity(w2)

0.5

In [64]:
# Same score expressed as a percentage, rounded to two decimal places.
round(w1.wup_similarity(w2)*100,2)

50.0

In [67]:
# Rebind w1/w2 to the first noun senses of "computer" and "dog".
w1, w2 = wordnet.synset("computer.n.01"), wordnet.synset("dog.n.01")

In [68]:
# Show both synsets, one per line.
print(w1, w2, sep="\n")

Synset('computer.n.01')
Synset('dog.n.01')


In [70]:
# Wu-Palmer similarity score between the two synsets currently held in w1 and w2.
w1.wup_similarity(w2)

0.4444444444444444

In [69]:
# Same score expressed as a percentage, rounded to two decimal places.
round(w1.wup_similarity(w2)*100,2)

44.44