# Tokenize text into words/tokens (using word_tokenize) or into sentences (using sent_tokenize)

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample input reused by the later cells (word_list feeds the stopword filter).
text = "Hi my name is Syed. This is a sample text"
# Split the text into a list of sentences.
sentence_list = sent_tokenize(text)
# Split the same text into a flat list of word-level tokens.
word_list = word_tokenize(text)
print(sentence_list)
print(word_list) # NOTE: Punctuation such as '.' is also treated as a separate token

['Hi my name is Syed.', 'This is a sample text']
['Hi', 'my', 'name', 'is', 'Syed', '.', 'This', 'is', 'a', 'sample', 'text']


# Remove stopwords and punctuation

In [6]:
from nltk.corpus import stopwords # Stopwords are very common words (like 'a', 'an', 'the') that carry little information
from string import punctuation # String containing the ASCII punctuation symbols

# Build the exclusion set once: English stopwords plus every punctuation character.
filter_words = set(stopwords.words('english')).union(punctuation)
# NOTE: the membership test is case-sensitive, so a capitalised stopword such as
# 'This' is kept while lowercase ones ('my', 'is', 'a') are dropped.
nonstopword_list = [token for token in word_list if token not in filter_words]
print(nonstopword_list)

['Hi', 'name', 'Syed', 'This', 'sample', 'text']


# Bi-grams are pairs of words/tokens that occur together to form a different meaning. Example: 'Tamil Nadu'

# We can identify such word pairs by analysing how often the words occur together. Example: 'Tamil Nadu is a really good place. You should visit Tamil Nadu!'. Here, the pair 'Tamil Nadu' occurs more frequently than any other pair of adjacent words.

In [8]:
from nltk.collocations import BigramCollocationFinder # TrigramCollocationFinder can be used in the same way to find tri-grams

# Collect the bigrams (pairs of neighbouring tokens) of the filtered word list.
finder = BigramCollocationFinder.from_words(nonstopword_list)
# ngram_fd holds the count of each bigram; sorting the (bigram, count) items
# orders them by bigram. As the last expression of the cell, the result is displayed.
sorted(finder.ngram_fd.items())

[(('Hi', 'name'), 1),
 (('Syed', 'This'), 1),
 (('This', 'sample'), 1),
 (('name', 'Syed'), 1),
 (('sample', 'text'), 1)]

# Part of Speech tagging : Identifying the part of speech (verb, noun, adjective, ...) of each word in a sentence

In [9]:
from nltk import pos_tag
# Tag every remaining token; the result is a list of (token, tag) pairs.
pos_tag(nonstopword_list) # See http://www.nltk.org/book/ch05.html for the meaning of the tag abbreviations

[('Hi', 'NNP'),
 ('name', 'NN'),
 ('Syed', 'NNP'),
 ('This', 'DT'),
 ('sample', 'NN'),
 ('text', 'NN')]

# Word sense disambiguation : Identifying the meaning of a word based on its context. Example: in the sentences 'It was a cool movie' and 'I like tall grass of cool water', the word 'cool' has a different meaning depending on the context

In [15]:
from nltk.corpus import wordnet as wn # WordNet is a thesaurus-like database of words and the relationships between them
from nltk.wsd import lesk # lesk performs word sense disambiguation

# List every sense (synset) WordNet knows for 'bass', together with its definition.
for synset in wn.synsets('bass'):
    print(synset, synset.definition())
print("-" * 50)

# Ask lesk which of those senses it considers the best fit for this sentence.
# The pick is algorithmic, so it is not guaranteed to match human judgement.
context_tokens = word_tokenize("Sing in a lower tone, along with the bass")
word_classified = lesk(context_tokens, 'bass')
print(word_classified, word_classified.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range
--------------------------------------------------
Synset('bass.n.07') the member with the lowest range of a family of musical instruments


#  Stemming : Sometimes just tokenising the words won't be enough. For example, 'closing', 'closed' and 'closer' will be treated as different words although they carry the same core meaning, since the parent word of all of them is 'close'. To treat these words as the same, we can strip their endings and reduce each of them to the stem 'clos' (the part common to 'closing', 'closed' and 'closer').


In [12]:
from nltk.stem.lancaster import LancasterStemmer # There are many stemming algorithms; LancasterStemmer is one of them

inital_text = "Syed closed the window on closing night when he was in the mood to close."
stemmer = LancasterStemmer()
# Tokenize the sentence, then reduce every token to its stem.
stemmed_words = list(map(stemmer.stem, word_tokenize(inital_text)))
print(stemmed_words)

['syed', 'clos', 'the', 'window', 'on', 'clos', 'night', 'when', 'he', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']
