## Tokenization

In [1]:
import nltk

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample passage made of two sentences; it is reused by later cells.
text = 'Marry had a little lamb. Her fleece was white as snow'

# Split the passage into a list of sentence strings and display it.
sentences = sent_tokenize(text)
sentences

['Marry had a little lamb.', 'Her fleece was white as snow']

In [3]:
# Tokenize each sentence on its own, producing one list of word tokens per sentence.
# Note that the trailing '.' comes back as a separate token.
words = list(map(word_tokenize, sentences))
words

[['Marry', 'had', 'a', 'little', 'lamb', '.'],
 ['Her', 'fleece', 'was', 'white', 'as', 'snow']]

## Stop Words Removal

In [4]:
from nltk.corpus import stopwords
from string import punctuation

# Build a single set holding every English stop word plus every ASCII
# punctuation character, so both kinds of token can be filtered out with
# one membership test.
# The corpus file id must be lowercase ('english'): a capitalised id only
# resolves on case-insensitive filesystems and fails elsewhere.
allStopWords = set( stopwords.words('english') + list(punctuation) )

In [5]:
# list(punctuation)

In [6]:
# Keep only the tokens of the full text that are absent from the
# stop-word/punctuation set.
# NOTE(review): the membership test is case-sensitive, so the capitalised
# 'Her' is kept - presumably the stop-word list stores it in lowercase; confirm
# whether lowercasing tokens before the lookup is wanted.
wordsWithoutStopWords = list(
    filter(lambda token: token not in allStopWords, word_tokenize(text))
)
wordsWithoutStopWords

['Marry', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']

## Identifying Bigrams

In [9]:
# Import only the name this cell uses instead of a wildcard import, so the
# notebook namespace stays predictable.
from nltk.collocations import BigramCollocationFinder

# Build a collocation finder over the stop-word-free tokens. Its `ngram_fd`
# frequency distribution maps each pair of adjacent tokens (bigram) to the
# number of times it occurs; sorting the items gives a stable display order.
# (Association measures for scoring bigrams are available separately via
# nltk.collocations.BigramAssocMeasures(); they are not needed here.)
finder_obj = BigramCollocationFinder.from_words( wordsWithoutStopWords )
sorted(finder_obj.ngram_fd.items())
# Now we have the bigrams with their frequencies.
# Trigrams can be found in a similar way.

[(('Her', 'fleece'), 1),
 (('Marry', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

## Stemming and Part of Speech Recognition

In [10]:
# Sample sentence containing three inflections of "close" (closed, closing,
# close); it is the input for the stemming and POS-tagging cells.
text2 = 'Mary closed on closing night when she was in the mood to close'

* Observe that *closed*, *closing*, and *close* can all be treated as forms of the same word.
* We reduce them to a common form using NLTK's stemming.

In [11]:
from nltk.stem.lancaster import LancasterStemmer

# Run every token of text2 through the Lancaster stemmer and display the
# resulting list of stems.
stem_obj = LancasterStemmer()
stemmedWords = list(map(stem_obj.stem, word_tokenize(text2)))
stemmedWords

['mary',
 'clos',
 'on',
 'clos',
 'night',
 'when',
 'she',
 'was',
 'in',
 'the',
 'mood',
 'to',
 'clos']

Observe how all forms of *close* are stemmed to `clos`.

In [14]:
# Part-of-speech tagging: pair every token of text2 with its POS tag.
# (A real comment replaces the former bare string literal, which was a no-op
# expression statement rather than a docstring.)
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

* Mary: proper noun (`NNP`)
* closed: verb, past tense (`VBD`)
* ...and so on for the remaining tokens.

## Disambiguating Word Meaning
---
* The same word can have different meanings in different sentences.
* For example: *bass*.
* WordNet works somewhat like a dictionary that can be queried from Python.

In [22]:
from nltk.corpus import wordnet as wn

# List every WordNet synset registered for the word 'bass', each followed by
# its dictionary-style definition.
for synset in wn.synsets('bass'):
    print(synset, synset.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [40]:
from nltk.wsd import lesk

# Ask Lesk which WordNet sense of 'bass' fits this (musical) context sentence,
# then show the chosen synset together with its definition.
meaning1 = lesk(
    context_sentence=word_tokenize('Sing in a lower tone, along with the bass'),
    ambiguous_word='bass',
)
print(meaning1, meaning1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [41]:
# Same disambiguation, this time with a fishing-related context sentence;
# show the chosen synset together with its definition.
meaning2 = lesk(
    context_sentence=word_tokenize('The sea bass was really hard to catch'),
    ambiguous_word='bass',
)
print(meaning2, meaning2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
