In [6]:
import nltk

In [7]:
# Sample text for the tokenization, stop-word and bigram cells:
# three sentences, the last one repeating "New York".
text = (
    "Mary had a little lamb. "
    "Her fleece was white as snow. "
    "New York is a beautiful city New York"
)

In [9]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [11]:
# Download the Punkt tokenizer data (unpacked under tokenizers/, see the log
# below); presumably this is what the tokenizer calls in the next cells rely on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### Tokenize by sentences

In [10]:
# Split the raw text into a list of sentence strings.
sents = sent_tokenize(text)
# The third "sentence" has no closing period because the input text has none.
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow.', 'New York is a beautiful city New York']


### Tokenize each sentence into words

In [11]:
# One token list per sentence; punctuation marks come out as their own tokens.
words = list(map(word_tokenize, sents))
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow', '.'], ['New', 'York', 'is', 'a', 'beautiful', 'city', 'New', 'York']]


### Adding Custom StopWords

In [12]:
from nltk.corpus import stopwords
from string import punctuation
# The stopwords corpus is a separate NLTK data package. Every other resource
# used in this notebook is downloaded explicitly, so fetch this one too;
# otherwise stopwords.words() raises LookupError on a fresh environment.
# quiet=True keeps the cell output unchanged.
nltk.download('stopwords', quiet=True)

# Stop-word set = NLTK's English stop words plus every character in
# string.punctuation, so punctuation tokens are filtered out as well.
customStopWords = set(stopwords.words('english')+list(punctuation))

### Print the words of the text without the stop words

In [13]:
# Keep only the tokens of the whole text that are not in customStopWords.
# NOTE(review): the membership test is case-sensitive, so the capitalised
# 'Her' is kept in the printed result below; comparing token.lower() would
# drop it -- confirm whether that is intended.
wordsWOStopwords = list(
    filter(lambda token: token not in customStopWords, word_tokenize(text))
)
print(wordsWOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow', 'New', 'York', 'beautiful', 'city', 'New', 'York']


### Looking for Bigrams

In [14]:
from nltk.collocations import *
# Build the collocation finder from the stop-word-filtered tokens; its
# ngram_fd attribute holds the adjacent-pair counts shown in the next cell.
finder = BigramCollocationFinder.from_words(wordsWOStopwords)

# Bigram association measures object; not used by any cell shown here.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [15]:
# Bigram frequency table as ((word1, word2), count) tuples, sorted by the pair.
# ('New', 'York') is the only pair occurring twice.
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('New', 'York'), 2),
 (('York', 'beautiful'), 1),
 (('beautiful', 'city'), 1),
 (('city', 'New'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('snow', 'New'), 1),
 (('white', 'snow'), 1)]

### Stemming a sentence

In [36]:
# Sample sentence with three inflections of "close" (closed / closing / close),
# used by the stemming and part-of-speech cells.
text2 = (
    "Mary closed on closing night "
    "when she was in the mood to close."
)

In [37]:
from nltk.stem.lancaster import LancasterStemmer
# Lancaster stemmer instance, applied to every token of text2.
st = LancasterStemmer()
# As the output below shows, the stems are lowercased and
# closed / closing / close all reduce to 'clos'.
stemmedWords = list(map(st.stem, word_tokenize(text2)))
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [38]:
# Download the averaged-perceptron tagger data; presumably this is the model
# behind nltk.pos_tag in the next cell. Already installed here, per the log.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Printing each word's part-of-speech tag

In [39]:
# Tag every token of text2; the result is a list of (token, tag) pairs.
# Note how 'closed', 'closing' and 'close' get different tags (VBD, NN, VB).
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [40]:
import nltk
# Download the WordNet corpus (unpacked under corpora/, see the log below);
# the synset and Lesk cells that follow query it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

### Printing Definitions

In [54]:
from nltk.corpus import wordnet as wn
# List every WordNet sense for 'models' with its definition.
# The lookup resolves the plural to the lemma 'model': the output below
# contains noun senses (.n.) and verb senses (.v.) of 'model'.
for ss in wn.synsets('models'):
    print(ss, ss.definition())

Synset('model.n.01') a hypothetical description of a complex entity or process
Synset('model.n.02') a type of product
Synset('model.n.03') a person who poses for a photographer or painter or sculptor
Synset('model.n.04') representation of something (sometimes on a smaller scale)
Synset('exemplar.n.01') something to be imitated
Synset('model.n.06') someone worthy of imitation
Synset('model.n.07') a representative form or pattern
Synset('mannequin.n.01') a woman who wears clothes to display fashions
Synset('model.n.09') the act of representing something (usually on a smaller scale)
Synset('model.v.01') plan or create according to a model or models
Synset('model.v.02') form in clay, wax, etc
Synset('model.v.03') assume a posture as for artistic purposes
Synset('model.v.04') display (clothes) as a mannequin
Synset('model.v.05') create a representation or model of
Synset('model.v.06') construct a model of


In [42]:
from nltk.wsd import lesk
# Word-sense disambiguation with Lesk: pick the sense of 'bass' that best
# fits the tokenized context sentence.
sense1 = lesk(
    word_tokenize('Sing in a lower tone, along with the bass'),
    'bass',
)
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


### Disambiguating a word in our own sentence

In [52]:
from nltk.wsd import lesk
# Same Lesk disambiguation on a sentence of our own, targeting 'models'.
# NOTE(review): the result below is a verb sense (model.v.01) although
# 'models' is used as a noun here; lesk() accepts a pos argument that could
# restrict the candidates to nouns -- confirm whether that is wanted.
sense1 = lesk(
    word_tokenize('Britains most crashed cars have been revealed, with Vauxhall models three of the four most likely to be in an accident.'),
    'models',
)
print(sense1, sense1.definition())

Synset('model.v.01') plan or create according to a model or models
