In [2]:
# Basic parsing: sentence and word tokenization

In [3]:
import nltk

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [5]:
# Two-sentence sample text reused by the tokenization and stopword cells.
text = (
    'Mary had a little lamb. '
    'Her fleece was white as snow'
)

In [6]:
# First split the text into sentences, then tokenize every sentence into
# words, producing one token list per sentence.
sentences = sent_tokenize(text)
grouped_words = list(map(word_tokenize, sentences))

In [7]:
# Show the nested result: one inner list of tokens per sentence, with the
# sentence-final '.' kept as its own token.
print(grouped_words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


In [8]:
# stopwords

In [9]:
from nltk.corpus import stopwords
from string import punctuation

In [10]:
# Everything to discard: NLTK's English stopwords plus every single
# punctuation character from string.punctuation.
generic_stopwords = set(stopwords.words('english')) | set(punctuation)

In [11]:
pruned_words = [word for word in word_tokenize(text) if word not in generic_stopwords]

In [12]:
print(pruned_words)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


In [13]:
# n-grams

In [14]:
from nltk.collocations import *

In [15]:
sorted_bygrams = sorted(BigramCollocationFinder.from_words(pruned_words).ngram_fd.items())

In [16]:
print(sorted_bygrams)

[(('Her', 'fleece'), 1), (('Mary', 'little'), 1), (('fleece', 'white'), 1), (('lamb', 'Her'), 1), (('little', 'lamb'), 1), (('white', 'snow'), 1)]


In [17]:
# stemming

In [18]:
from nltk.stem.lancaster import LancasterStemmer

In [19]:
# Sample sentence containing three inflections of "close" (closed, closing,
# close) for the stemmer to work on.
stemming_text = 'Mary closed on closing night when she was in the mood to close.'

# Lancaster stemmer instance applied to the sentence in the next cell.
stemmer = LancasterStemmer()

In [20]:
# Tokenize the sample sentence and reduce every token to its stem.
stemmed_words = list(map(stemmer.stem, word_tokenize(stemming_text)))

In [21]:
# Show the stems: 'closed', 'closing' and 'close' all collapse to 'clos',
# and 'Mary' comes back lower-cased.
print(stemmed_words)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [22]:
# Part-of-speech tagging

In [23]:
from nltk import pos_tag

In [24]:
# Re-tokenize the stemming sample sentence (unstemmed) and tag each token
# with its part of speech; the result is a list of (token, tag) pairs.
tagged_words = pos_tag(word_tokenize(stemming_text))

In [25]:
# Show the (token, tag) pairs, e.g. 'closed' -> VBD but 'closing' -> NN.
print(tagged_words)

[('Mary', 'NNP'), ('closed', 'VBD'), ('on', 'IN'), ('closing', 'NN'), ('night', 'NN'), ('when', 'WRB'), ('she', 'PRP'), ('was', 'VBD'), ('in', 'IN'), ('the', 'DT'), ('mood', 'NN'), ('to', 'TO'), ('close', 'VB'), ('.', '.')]


In [26]:
# Word-sense disambiguation (Lesk algorithm)

In [27]:
from nltk.corpus import wordnet
from nltk.wsd import lesk

In [28]:
# Context sentence and the word inside it whose sense should be resolved.
ambiguous_text, ambiguous_word = (
    'sing in a lower tone, along with the bass',
    'bass',
)

In [29]:
# Split the context sentence into tokens; this token list is what gets
# passed to lesk() as the context.
tokenized_text = word_tokenize(ambiguous_text)

In [34]:
# Pick a sense for the ambiguous word using the tokenized sentence as context.
disambigued_meaning = lesk(
    context_sentence=tokenized_text,
    ambiguous_word=ambiguous_word,
)

In [35]:
# lesk() returns None when it finds no candidate synsets for the word, so
# guard before calling .definition() to avoid an AttributeError.
if disambigued_meaning is None:
    print(f'No WordNet sense found for {ambiguous_word!r}')
else:
    # Show the chosen synset together with its dictionary definition.
    print(disambigued_meaning, disambigued_meaning.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments
