# Module 2 (Python 3)

## Basic NLP Tasks with NLTK

In [1]:
import nltk
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### Counting vocabulary of words

In [2]:
# text7 is the Wall Street Journal sample loaded by nltk.book
text7

<Text: Wall Street Journal>

In [3]:
# list the sample sentences sent1 ... sent9 that nltk.book loaded
sents()

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [4]:
# look at sent7, the sample sentence from text7; it is stored as a list of tokens
sent7

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [5]:
# sent7 has 18 tokens -- punctuation marks such as ',' and '.' count as tokens too
len(sent7)

18

In [6]:
# the total number of tokens (words and punctuation) in text7
len(text7)

100676

In [7]:
# the number of unique tokens in text7 (case-sensitive: 'The' and 'the' are counted separately)
len(set(text7))

12408

In [8]:
# show 10 tokens from the set; a set is unordered, so which 10 appear can differ between runs.
# If a u appears in front of a word under Python 2 (u'bottom'), it marks a unicode string, not a UTF-8 encoding.
list(set(text7))[:10]

['Kong',
 'WTVJ',
 'rain',
 'each',
 'drew',
 'Deposits-a',
 'readings',
 'Remember',
 'Osaka',
 'counterweight']

### Frequency of words

In [9]:
# build a frequency distribution: it maps each distinct token of text7 to its number of occurrences
dist = FreqDist(text7)
# number of distinct tokens (the same value as len(set(text7)))
len(dist)

12408

In [10]:
# look at the actual tokens (the keys of the frequency distribution)
vocab1 = dist.keys()
# In Python 2 this could be written as vocab1[:10].
# In Python 3 dict.keys() returns a view instead of a list, so convert it to a list before slicing.
list(vocab1)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [11]:
# look up how many times the token 'four' occurs in text7
dist['four']

20

In [12]:
# keep the tokens that are longer than 5 characters and occur more than 100 times
freqwords = [
    word
    for word, count in dist.items()
    if len(word) > 5 and count > 100
]
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

### Normalization and stemming

In [13]:
# the same word in several different surface forms
input1 = "List listed lists listing listings"
# Lowercase everything, then split on whitespace. split() with no argument also
# copes with repeated spaces, tabs and newlines, where split(' ') would produce
# empty '' entries.
words1 = input1.lower().split()
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [14]:
# build a Porter stemmer
porter = nltk.PorterStemmer()
# reduce every word in words1 to its stem
list(map(porter.stem, words1))

['list', 'list', 'list', 'list', 'list']

### Lemmatization

In [15]:
# udhr is the corpus of the Universal Declaration of Human Rights; load the words of the 'English-Latin1' file
udhr = nltk.corpus.udhr.words('English-Latin1')
# preview the first 20 tokens
udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [16]:
# This is still stemming (the Porter stemmer), not lemmatization:
# stems such as 'univers' and 'digniti' are not valid words
list(map(porter.stem, udhr[:20]))

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [17]:
# Lemmatization: like stemming, but every result is a valid word.
# The capitalised 'Rights' is left unchanged, while the lowercase 'rights' becomes 'right'.
WNlemma = nltk.WordNetLemmatizer()
list(map(WNlemma.lemmatize, udhr[:20]))

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

### Tokenization

In [18]:
text11 = "Children shouldn't drink a sugary drink before bed."
# a plain split on spaces leaves the full stop attached to the last word ('bed.')
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [19]:
# word_tokenize separates the full stop into its own token and splits "shouldn't" into 'should' and "n't"
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [20]:
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
# split the text into sentences
sentences = nltk.sent_tokenize(text12)
# number of sentences found
len(sentences)

4

In [21]:
# the full stops inside 'U.S.' and '$2.99' were not treated as sentence boundaries
sentences

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

## Advanced NLP Tasks with NLTK

### POS tagging (POS = part of speech)

In [22]:
nltk.help.upenn_tagset('MD') # use the built-in help to check what the tag 'MD' means; 'MD' comes from the upenn tagset

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [23]:
text13 = nltk.word_tokenize(text11) # "shouldn't" in text11 is split into two tokens: 'should' and "n't"
nltk.pos_tag(text13) # run pos_tag to get a (token, tag) pair for every token

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [24]:
# tokenize a second example sentence and tag each token with its part of speech
text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
nltk.pos_tag(text14)

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

In [25]:
# Parsing sentence structure
text15 = nltk.word_tokenize("Alice loves Bob")
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""") # the grammar rules are given as a string; CFG means Context-Free Grammar

parser = nltk.ChartParser(grammar) # use the grammar to create a parser
trees = parser.parse_all(text15) # parse the sentence, which gives its parse trees
for tree in trees:
    print(tree) # 'S' below means sentence, 'NP' means noun phrase, 'V' means verb, 'VP' means verb phrase

(S (NP Alice) (VP (V loves) (NP Bob)))


In [29]:
text16 = nltk.word_tokenize("I saw the man with a telescope")
grammar1 = nltk.data.load('mygrammar.cfg') # load the grammar you have written in mygrammar.cfg
grammar1

<Grammar with 13 productions>

In [30]:
parser = nltk.ChartParser(grammar1)
trees = parser.parse_all(text16)  # text16 is 'I saw the man with a telescope'
for tree in trees:
    print(tree)  # 'PP' below means prepositional phrase; text16 has two possible parses

(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))


In [31]:
from nltk.corpus import treebank
text17 = treebank.parsed_sents('wsj_0001.mrg')[0] # the treebank corpus offers a large collection of ready-made Wall Street Journal parse trees; take the first one in wsj_0001.mrg
print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


### POS tagging and parsing ambiguity

In [32]:
# an ambiguous sentence: in the intended reading 'old' is the noun and 'man' is the verb,
# but the tagger returns 'old' as JJ (adjective) and 'man' as NN (noun)
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [33]:
text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
nltk.pos_tag(text19) # tag each token of this grammatical but nonsensical sentence with its part of speech

[('Colorless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]