In [1]:
import nltk


In [2]:
# Fetch the required data non-interactively. Calling nltk.download() with no
# arguments opens the interactive downloader, which blocks a
# "Restart & Run All" and cannot run in a headless environment.
# NOTE(review): the 'book' collection is expected to cover everything used
# below (the nltk.book texts, udhr, treebank, wordnet, punkt, the POS tagger
# and tagsets) - confirm against the installed NLTK version.
nltk.download('book')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


# Counting vocabulary of words

In [4]:
text7  # display the Text object that nltk.book loaded under this name

<Text: Wall Street Journal>

In [5]:
sent7  # a sample sentence, stored as a list of string tokens

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [6]:
len(sent7)  # token count; punctuation marks are tokens too

18

In [7]:
len(text7)  # total number of tokens in the text, repeats included

100676

In [8]:
len(set(text7))  # number of distinct tokens (vocabulary size), not total tokens

12408

In [10]:
# Peek at ten vocabulary items. A set has no defined order, so which ten
# appear can differ from one kernel session to the next.
unique_tokens = set(text7)
list(unique_tokens)[:10]

['engineering',
 'Superconductors',
 '*-42',
 'thwart',
 'Landor',
 'Investors',
 'Oak',
 'Tuscany',
 '6.5',
 'Edward']

# Frequency of words

In [11]:
# Frequency distribution over text7: maps each distinct token to the number
# of times it occurs. FreqDist is brought into scope by the wildcard import
# from nltk.book.
dist = FreqDist(text7)

In [12]:
len(dist)  # one entry per distinct token, so this equals len(set(text7))

12408

In [15]:
# Keep the keys of the distribution as the vocabulary; wrap them in a list
# only for display, because the keys object itself cannot be sliced.
vocab = dist.keys()
first_ten_words = list(vocab)[:10]
first_ten_words

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [18]:
dist['years']   # number of times the token 'years' occurs in text7

115

In [22]:
# Tokens longer than five characters that occur more than 100 times.
# The comprehension is the compact form of looping over vocab and appending
# every match to a list.
[word for word in vocab if dist[word] > 100 and len(word) > 5]

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

# Normalization and stemming

In [26]:
# Normalisation: fold every space-separated word to lower case so that
# 'List' and 'list' are treated as the same token.
input1 = "List listed lists listing listings"
norm = [word.lower() for word in input1.split(' ')]
print(norm)

['list', 'listed', 'lists', 'listing', 'listings']


In [28]:
# Porter stemmer instance, reused by the stemming cells that follow.
stemmer = nltk.PorterStemmer()

In [29]:
# Reduce each normalised word to its stem.
list(map(stemmer.stem, norm))

['list', 'list', 'list', 'list', 'list']

# Lemmatization

In [31]:
# Words of the 'English-Latin1' file in NLTK's udhr corpus; show the first
# 20 tokens.
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [32]:
# Stem the same 20 tokens for comparison with the lemmatizer below: the
# stems are lower-cased and need not be dictionary words.
first_twenty = udhr[:20]
[stemmer.stem(token) for token in first_twenty]

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [33]:
# WordNet lemmatizer instance, used in the next cell.
lemmatizer = nltk.WordNetLemmatizer()

In [34]:
# Lemmatize the same 20 tokens. In the recorded output the words stay valid
# words and keep their case: 'rights' becomes 'right', but the capitalised
# 'Rights' is left unchanged.
[lemmatizer.lemmatize(word) for word in udhr[:20]]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

# Tokenization

# Word tokenize

In [35]:
# Example sentence containing a contraction and sentence-final punctuation.
text11 = "Children shouldn't drink a sugary drink before bed."

In [36]:
# Naive split on spaces: the contraction stays whole and the final period
# stays attached to the last word.
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [37]:
# NLTK's word tokenizer separates the clitic "n't" and the sentence-final
# period into tokens of their own.
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

# Sentence tokenize

In [38]:
# Short paragraph whose periods are not all sentence boundaries
# (an abbreviation and a decimal price).
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"

In [39]:
# Split the paragraph into sentences. In the recorded output the periods
# inside "U.S." and "$2.99" are not treated as sentence ends.
nltk.sent_tokenize(text12)

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

In [43]:
# Report how many sentences the tokenizer finds in text12.
sentence_count = len(nltk.sent_tokenize(text12))
print('There are {} sentences in the above paragraph.'.format(sentence_count))

There are 4 sentences in the above paragraph.


# Advanced NLP Tasks with NLTK

In [51]:

# Print NLTK's description and examples for the part-of-speech tag 'MD'.
nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [53]:
# Tokenize first, then tag: pos_tag works on a list of tokens and returns
# (token, tag) pairs.
tokens11 = nltk.word_tokenize(text11)
nltk.pos_tag(tokens11)

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [54]:
text12 = "Visiting aunts can be a nuisance"

In [56]:
nltk.pos_tag(nltk.word_tokenize(text12))

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

# Parsing sentence structure

In [57]:
# Three-word sentence to be parsed with a hand-written grammar.
text13 = 'Alice loves Bob'

In [59]:
# Toy context-free grammar: a sentence (S) is a noun phrase followed by a
# verb phrase, and the only terminals are the three words of text13.
grammar = nltk.CFG.fromstring('''
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
''')

# Chart-parse the tokenized sentence and print every parse tree the grammar
# admits.
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(nltk.word_tokenize(text13))
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [63]:
from nltk.corpus import treebank

# Read the parsed sentences of treebank file 'wsj_0001.mrg' and keep the
# tree of the first one.
wsj_trees = treebank.parsed_sents('wsj_0001.mrg')
text14 = wsj_trees[0]
print(text14)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


# POS tagging and parsing ambiguity

In [65]:
# Garden-path sentence: in its sensible reading 'man' is the verb, yet the
# recorded tagger output labels 'old' as an adjective and 'man' as a noun.
text15 = "The old man the boat"
tokens15 = nltk.word_tokenize(text15)
nltk.pos_tag(tokens15)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [67]:
# A well-formed but meaningless sentence; the tagger still assigns a tag to
# every word.
text16 = "Colorless green ideas sleep furiously"
nltk.pos_tag(nltk.word_tokenize(text16))

[('Colorless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]