In [5]:
import nltk
nltk.download('all')

[nltk_data] Downloading package punkt to /home/lab-pc-10/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Tokenize

In [6]:
text = "Mary had a little lamb. Her fleese was white as snow."
from nltk.tokenize import word_tokenize, sent_tokenize

In [7]:
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleese was white as snow.']


In [8]:
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleese', 'was', 'white', 'as', 'snow', '.']]


# Removing stop words

In [13]:
from nltk.corpus import stopwords
#nltk.download('stopwords')
from string import punctuation
customStopWords = set(stopwords.words('english') + list(punctuation))

[nltk_data] Downloading package stopwords to /home/lab-
[nltk_data]     pc-10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
WOStopwords = [word for word in word_tokenize(text) if word not in customStopWords]
print(WOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleese', 'white', 'snow']


# Identify Bigrams

In [16]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(WOStopwords)

In [17]:
sorted(finder.ngram_fd.items())

[(('Her', 'fleese'), 1),
 (('Mary', 'little'), 1),
 (('fleese', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

# Stemming and Parts of Speech tagging 

In [18]:
text2 = "Mary closed on closing nights when she was in the mood to close."

In [19]:
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

In [20]:
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [24]:
#nltk.download('all')
nltk.pos_tag(word_tokenize(text2))

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package ce

[nltk_data]    |   Unzipping corpora/senseval.zip.
[nltk_data]    | Downloading package sentiwordnet to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/sentiwordnet.zip.
[nltk_data]    | Downloading package sentence_polarity to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/sentence_polarity.zip.
[nltk_data]    | Downloading package shakespeare to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/shakespeare.zip.
[nltk_data]    | Downloading package sinica_treebank to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/sinica_treebank.zip.
[nltk_data]    | Downloading package smultron to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/smultron.zip.
[nltk_data]    | Downloading package state_union to /home/lab-
[nltk_data]    |     pc-10/nltk_data...
[nltk_data]    |   Unzipping corpora/state_un

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('nights', 'NNS'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

# Word sense disambiguation 

In [25]:
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [26]:
from nltk.wsd import lesk

In [27]:
sense1 = lesk(word_tokenize("Sign in a lower tone, along with the bass"), 'bass')
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [28]:
sense2 = lesk(word_tokenize("This sea bass was really hard to catch"), 'bass')
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
