## Basic NLP Tasks using NLTK -> Natural Language Tool Kit

In [2]:
import nltk
#nltk.download()
from nltk.book import *
# To get text corpora
# This could make us download the following -
#nltk.download('inaugural')
#nltk.download('nps_chat')
#nltk.download('webtext')
#nltk.download('treebank')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [4]:
# To view what sentences are there in each corpora
sents()

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [5]:
sent1

['Call', 'me', 'Ishmael', '.']

In [6]:
# Counting vocab of words
text7

<Text: Wall Street Journal>

In [7]:
sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [8]:
len(sent7)

18

In [9]:
len(text7)

100676

In [10]:
# unique words

len(set(text7))

12408

In [11]:
# Listing first 10 unique words
list(set(text7))[:10]

['center',
 'hydraulically',
 '185.9',
 'Publishing',
 'grant',
 'desultory',
 'codified',
 'You',
 '99.1',
 'freshman']

In [12]:
# Frequency of words
dist=FreqDist(text7)
dist

FreqDist({',': 4885, 'the': 4045, '.': 3828, 'of': 2319, 'to': 2164, 'a': 1878, 'in': 1572, 'and': 1511, '*-1': 1123, '0': 1099, ...})

In [13]:
len(dist)

12408

In [14]:
vocab1=dist.keys()
list(vocab1)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [15]:
dist['four']

20

In [16]:
# Words that are greater than 5 letters and appear over 100 times in the text
freqwords=[w for w in vocab1 if len(w)>5 and dist[w]>100]
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

### Normalization and Stemming

In [17]:
input1 = "List listed lists listing listings"
words1=input1.lower().split(' ')
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [18]:
# All words are changed into list -> Gives words stem
porter=nltk.PorterStemmer()

[porter.stem(t) for t in words1]

['list', 'list', 'list', 'list', 'list']

### Lemmatization -> This will meaningfully shorten the words and the resulting stem is valid

In [19]:
udhr=nltk.corpus.udhr.words('English-Latin1')
udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [20]:
[porter.stem(t) for t in udhr]

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of',
 'all',
 'member',
 'of',
 'the',
 'human',
 'famili',
 'is',
 'the',
 'foundat',
 'of',
 'freedom',
 ',',
 'justic',
 'and',
 'peac',
 'in',
 'the',
 'world',
 ',',
 'wherea',
 'disregard',
 'and',
 'contempt',
 'for',
 'human',
 'right',
 'have',
 'result',
 'in',
 'barbar',
 'act',
 'which',
 'have',
 'outrag',
 'the',
 'conscienc',
 'of',
 'mankind',
 ',',
 'and',
 'the',
 'advent',
 'of',
 'a',
 'world',
 'in',
 'which',
 'human',
 'be',
 'shall',
 'enjoy',
 'freedom',
 'of',
 'speech',
 'and',
 'belief',
 'and',
 'freedom',
 'from',
 'fear',
 'and',
 'want',
 'ha',
 'been',
 'proclaim',
 'as',
 'the',
 'highest',
 'aspir',
 'of',
 'the',
 'common',
 'peopl',
 ',',
 'wherea',
 'it',
 'is',
 'essenti',
 ',',
 'if',
 'man',
 'is',
 'not',
 'to',
 'be',
 'compel',
 'to',
 'have',
 'recours',
 '

In [21]:
# Using Lemmatization results in valid words
WNlemma=nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]

# In this, upper and lower case words are considered to be different and did not lemmarize

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

### Tokenization

In [23]:
# Splitting a sentence into words/tokens
text11="Children shouldn't drink a sugary drink before bed."

In [24]:
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [26]:
## NLTK tokenizer
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [32]:
# Sentence Splitting from a long string

text12="This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is! "

In [33]:
sentences=nltk.sent_tokenize(text12)
len(sentences)

4

In [34]:
sentences

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

## Advanced NLP Tasks using NLTK 

In [35]:
# Part of speech (POS) tagging 

# To get information about the word classes

nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [37]:
text11="Children shouldn't drink a sugary drink before bed."
text13=nltk.word_tokenize(text11)
text13

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [38]:
# Running POS tagger
nltk.pos_tag(text13)


# NNP - Plural noun, MD- Model word, RB -> end word, VB -> Verb, DT-> determiner, JJ-> adjective, VBG-> Gerund
# This could be used for feature engineering

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [39]:
# Ambiquity in POS tagging -> Visiting could be a gerund or verb . nltk displays the most used one in the corpus
text14= nltk.word_tokenize("Visting aunts can be nuisance")
nltk.pos_tag(text14)

[('Visting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('nuisance', 'JJ')]

In [40]:
# Parsing sentence structure
text15=nltk.word_tokenize("Alice loves Bob")
text15

['Alice', 'loves', 'Bob']

In [43]:
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

In [44]:
grammar

<Grammar with 5 productions>

In [47]:
parser=nltk.ChartParser(grammar)
trees=parser.parse_all(text15)
for tree in trees:
    print (tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [49]:
# Parsing could be ambiguous for this sentence
text16 = nltk.word_tokenize("I saw the man with a telescope")
grammar1 = nltk.data.load('mygrammar.cfg')
grammar1

# .cfg could be the file for grammar

In [None]:
# parser=nltk.ChartParser(grammar1)
trees=parser.parse_all(text16)
for tree in trees:
    print(tree)

In [52]:
from nltk.corpus import treebank
#text17=treebank.parsed_sents('wsj_001.mrg')[0]

### POS tagging and parsing complexity

In [54]:

text18=nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [55]:
text19=nltk.word_tokenize('Colorless green ideas sleep furiously')
nltk.pos_tag(text19)

[('Colorless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]