In [19]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Prashant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Prashant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Prashant/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

#### Read Text and Tokenize
Read input text, and tokenize it using sentence tokenizer to break it into separate sentences, and then tokenize it again using word tokenizer to break it down to unique words.

From this, we need to further remove stop words (which do not add much meaning to the sentence), and remove punctuation marks.

In [5]:
text="Mary had a little lamb. Her fleece was white as snow"
from nltk.tokenize import word_tokenize, sent_tokenize
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


In [6]:
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


In [22]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation
customStopWords = set(stopwords.words('english') + list(punctuation))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Prashant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
wordsWOStopwords = [word for word in word_tokenize(text) if word not in customStopWords]
print(wordsWOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


#### Identifying Bigrams
We can now identify bigrams (pair of 2 words) occuring together, and number of times they have occured together.

In [14]:
# Identifying Bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopwords)
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

#### Stemming
Different morphological words will be considered unique tokens, unless we identify the root word.
This affects us when we are trying to count number of times a word has occured in a piece of text. 

So we need to normalize variation of a word into its root form, this is called Stemming. 

In [16]:
text2 = "Mary closed on closing night when she was in the mood to close"
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos']


In [23]:
#download below for parts of speech detection
nltk.download('averaged_perceptron_tagger')
# identify part of speech in text (noun,verb..etc)
nltk.pos_tag(word_tokenize(text2))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Prashant/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

#### Word Sense Disambiguation
Here we identify meaning of words in a text, based on context in which it is used.

In [24]:
# import wordnet from nltk corpus, it is a lexicon (like thesauras)
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
# find various definitions of a given word
for ss in wn.synsets('bass'):
    print(ss, ss.definition())


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Prashant/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [25]:
# import lesk algorithm for word sense disambiguation
from nltk.wsd import lesk
# now lets find the most appropriate meaning of a word in a given sentence
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [27]:
# another example
sense2 = lesk(word_tokenize("This sea bass was really hard to catch"), "bass")
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
