Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit
Steven Bird, Ewan Klein, and Edward Loper
http://www.nltk.org/book/

# Chapter 3. Processing Raw Text

In [1]:
%matplotlib inline

import nltk, re, pprint
from nltk import word_tokenize

## 3.5 Useful Applications of Regular Expressions

### Extracting Word Pieces

In [2]:
import re

In [3]:
word = 'supercalifragilisticexpialidocious'

In [4]:
re.findall('[aeiou]', word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [5]:
len(re.findall('[aeiou]', word))

16

In [6]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [10]:
# Count every run of two or more consecutive vowels across the entries of `wsj`.
fd = nltk.FreqDist(
    run
    for entry in wsj
    for run in re.findall('[aeiou]{2,}', entry)
)

In [11]:
fd.items()

dict_items([('ea', 476), ('oi', 65), ('ou', 329), ('io', 549), ('ee', 217), ('ie', 331), ('ui', 95), ('ua', 109), ('ai', 261), ('ue', 105), ('ia', 253), ('ei', 86), ('iai', 1), ('oo', 174), ('au', 106), ('eau', 10), ('oa', 59), ('oei', 1), ('oe', 15), ('eo', 39), ('uu', 1), ('eu', 18), ('iu', 14), ('aii', 1), ('aiia', 1), ('ae', 11), ('aa', 3), ('oui', 6), ('ieu', 3), ('ao', 6), ('iou', 27), ('uee', 4), ('eou', 5), ('aia', 1), ('uie', 3), ('iao', 1), ('eei', 2), ('uo', 8), ('uou', 5), ('eea', 1), ('ueui', 1), ('ioa', 1), ('ooi', 1)])

In [13]:
# Raw string: '\d' inside a plain literal is an invalid escape sequence and
# triggers a warning on current Python versions; the pattern itself is unchanged.
[int(n) for n in re.findall(r'\d+', '2009-12-31')]

[2009, 12, 31]

### Doing More with Word Pieces

In [14]:
regexp = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

In [15]:
def compress(word):
    """Return the parts of ``word`` matched by the module-level ``regexp``, glued together.

    ``re.findall`` yields the non-overlapping matches left to right; anything
    the pattern does not match is dropped from the result.
    """
    return ''.join(re.findall(regexp, word))

In [16]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')

In [17]:
nltk.tokenwrap(compress(w) for w in english_udhr[:75])

'Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and\nof the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn\nof frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn\nrghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,\nand the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and'

In [18]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')

In [19]:
# Every two-character match of one of p/t/k/s/v/r followed by a vowel,
# gathered over all entries of `rotokas_words` (duplicates kept for counting).
cvs = [
    pair
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]

In [20]:
cfd = nltk.ConditionalFreqDist(cvs)

In [21]:
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [22]:
# Pair each consonant+vowel match with the word it was found in, so the
# matches can later be used as keys that map back to their source words.
cv_word_pairs = [
    (pair, entry)
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]

In [23]:
cv_index = nltk.Index(cv_word_pairs)

In [24]:
cv_index['su']

['kasuari']

In [25]:
cv_index['po']

['kaapo',
 'kaapopato',
 'kaipori',
 'kaiporipie',
 'kaiporivira',
 'kapo',
 'kapoa',
 'kapokao',
 'kapokapo',
 'kapokapo',
 'kapokapoa',
 'kapokapoa',
 'kapokapora',
 'kapokapora',
 'kapokaporo',
 'kapokaporo',
 'kapokari',
 'kapokarito',
 'kapokoa',
 'kapoo',
 'kapooto',
 'kapoovira',
 'kapopaa',
 'kaporo',
 'kaporo',
 'kaporopa',
 'kaporoto',
 'kapoto',
 'karokaropo',
 'karopo',
 'kepo',
 'kepoi',
 'keposi',
 'kepoto']

### Finding Word Stems

In [26]:
def stem(word):
    """Strip the first matching suffix from ``word``.

    Suffixes are tried in the order listed, so 'es' is removed in preference
    to 's'. If no suffix matches, ``word`` is returned unchanged. A word that
    consists solely of a suffix is reduced to the empty string.
    """
    suffixes = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')
    matched = next((s for s in suffixes if word.endswith(s)), None)
    if matched is None:
        return word
    return word[:len(word) - len(matched)]

In [27]:
re.findall('^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['ing']

In [28]:
re.findall('^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['processing']

In [29]:
re.findall('^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

[('process', 'ing')]

In [30]:
re.findall('^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('processe', 's')]

In [32]:
re.findall('^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('process', 'es')]

In [35]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')

[('language', '')]

In [36]:
def stem(word):
    """Return ``word`` with one trailing suffix removed, using a regular expression.

    The non-greedy first group takes the shortest prefix that lets the
    optional suffix group reach the end of the string, so 'processes'
    yields 'process' rather than 'processe'. Words without a listed
    suffix come back unchanged.
    """
    suffix_pattern = '^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # findall returns (prefix, suffix) tuples; only the first match's prefix is needed.
    first_match = re.findall(suffix_pattern, word)[0]
    return first_match[0]

In [37]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [38]:
tokens = nltk.word_tokenize(raw)

In [39]:
[stem(t) for t in tokens]

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'women',
 'ly',
 'in',
 'pond',
 'distribut',
 'sword',
 'i',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'Supreme',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

### Searching Tokenized Text

In [41]:
from nltk.corpus import gutenberg, nps_chat

In [42]:
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))

In [46]:
moby.findall("<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [48]:
chat = nltk.Text(nps_chat.words())
chat.findall("<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [49]:
chat.findall("<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [51]:
# Raw string: '\d' inside a plain literal is an invalid escape sequence and
# triggers a warning on current Python versions; the pattern itself is unchanged.
nltk.re_show(r'\d+', 'aaa-bbbb, 1234, oooo0')

aaa-bbbb, {1234}, oooo{0}


In [52]:
nltk.app.nemo()

In [53]:
from nltk.corpus import brown

In [57]:
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
# Raw string keeps the backslash in '\w' without relying on Python passing an
# invalid escape sequence through (which warns on current versions).
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


In [58]:
# Raw string keeps the backslash in '\w' without relying on Python passing an
# invalid escape sequence through (which warns on current versions).
hobbies_learned.findall(r"<as> <\w*> <as> <\w*s>")

as coppery as Delawares; as well as injuries; as much as was; as well
as personalities; as much as glass; as importantly as his
