In [1]:
# importing libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
import pandas as pd
import numpy as np
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict

In [2]:
# Tokenization of paragraphs/sentences
text = "Mary had a little lamb. Her fleece was white as snow"
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


In [3]:
# Tokenization of words
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


In [4]:
# Stopwords
custom_stopwords = set(stopwords.words('english')+list(punctuation))
print(custom_stopwords)
wordsWOStopwords = [word for word in word_tokenize(text) if word not in custom_stopwords]
print(wordsWOStopwords)

{'needn', '.', 'ours', 'aren', 'ma', 'isn', 'will', '_', '>', 'haven', "doesn't", 'so', 'just', 'should', 'too', 'and', 'don', 'he', '?', 'is', 'couldn', 'having', 'where', 'ourselves', 'be', "hasn't", 'yourselves', 's', "mustn't", 'who', 'over', 'y', 'of', 'were', '&', '[', 'against', 'what', 'hadn', '%', '~', 'very', '+', 'itself', 'or', 'the', 'in', 'mustn', 'up', 'my', "'", "you're", "she's", 'more', 'herself', ']', 'other', '-', 'these', 'can', 'further', "wouldn't", 'both', 'which', 'such', 'her', "that'll", 'mightn', 'those', 'its', 'was', 'd', '$', 'didn', 'down', 'weren', 'me', 'shan', 'your', 'hers', 'our', 'had', 'this', '@', '/', ')', 'when', 'himself', "should've", 'again', 'below', 'that', 'has', "aren't", '{', "couldn't", "don't", 'before', 't', '\\', 'doing', 'at', 'about', 'it', 'above', 'once', 'if', 'then', 'than', 'o', 'does', "shan't", 'all', 'won', '*', 'until', 'as', 'am', 'here', 'hasn', '}', 'but', 'under', "isn't", '(', "you'll", '"', 'during', "you'd", "it's"

In [5]:
# Bigrams and Ngrams
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopwords)
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

In [6]:
# Stemming (reducing words to their root)
text2 = "Mary closed on closing night when she was in the mood to close."
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [7]:
# POS tagging (part of speech tagging)(Check if it verb, noun, etc.)
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [8]:
# Disambiguating word meaning
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [9]:
# Word sense disambiguation
from nltk.wsd import lesk
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(sense1, sense1.definition())

sense2 = lesk(word_tokenize("This sea bass was really hard to catch"), "bass")
print(sense2, sense2.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
