Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit
Steven Bird, Ewan Klein, and Edward Loper
http://www.nltk.org/book/

# Chapter 03 - Processing Raw Text

## 3.1 Accessing Text from the Web and from Disk

### Electronic Books

In [0]:
%matplotlib inline

In [0]:
import nltk, re, pprint
from nltk import word_tokenize

In [0]:
from urllib import request

In [0]:
# url = "http://www.gutenberg.org/files/2554/2554.txt"
url = "http://www.gutenberg.org/files/2554/2554-0.txt"

In [0]:
response = request.urlopen(url)

In [0]:
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
raw = response.read().decode('utf-8-sig')
response.close()  # the body is fully read; release the HTTP connection

In [0]:
type(raw)
# <class 'str'>

str

In [0]:
len(raw)
# 1176893

1176966

In [0]:
raw[:75]
# 'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'

'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'

In [0]:
tokens = word_tokenize(raw)

In [0]:
type(tokens)
# <class 'list'>

list

In [0]:
len(tokens)
# 254354

257727

In [0]:
tokens[:10]
# ['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

In [0]:
text = nltk.Text(tokens)

In [0]:
type(text)
# <class 'nltk.text.Text'>

nltk.text.Text

In [0]:
text[1024:1062]
# ['CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in',
#  'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in',
#  'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly',
#  ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.']

['an',
 'exceptionally',
 'hot',
 'evening',
 'early',
 'in',
 'July',
 'a',
 'young',
 'man',
 'came',
 'out',
 'of',
 'the',
 'garret',
 'in',
 'which',
 'he',
 'lodged',
 'in',
 'S.',
 'Place',
 'and',
 'walked',
 'slowly',
 ',',
 'as',
 'though',
 'in',
 'hesitation',
 ',',
 'towards',
 'K.',
 'bridge',
 '.',
 'He',
 'had',
 'successfully']

In [0]:
text.collocation_list()
# Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
# Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
# woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
# great deal; Nikodim Fomitch; young man; Ilya Petrovitch; n't know;
# Project Gutenberg; Dmitri Prokofitch; Andrey Semyonovitch; Hay Market

['Katerina Ivanovna',
 'Pyotr Petrovitch',
 'Pulcheria Alexandrovna',
 'Avdotya Romanovna',
 'Rodion Romanovitch',
 'Marfa Petrovna',
 'Sofya Semyonovna',
 'old woman',
 'Project Gutenberg-tm',
 'Porfiry Petrovitch',
 'Amalia Ivanovna',
 'great deal',
 'young man',
 'Nikodim Fomitch',
 'Ilya Petrovitch',
 'Project Gutenberg',
 'Andrey Semyonovitch',
 'Hay Market',
 'Dmitri Prokofitch',
 'Good heavens']

In [0]:
raw.find("PART I")
# 5338

5335

In [0]:
raw.rfind("End of Project Gutenberg’s Crime")
# 1157743

1157811

In [0]:
# Locate the header/footer markers at run time rather than hard-coding
# character offsets: the offsets change whenever Project Gutenberg revises
# the file (the book quotes 5338 / 1157743, this download gave 5335 / 1157811).
body_start = raw.find("PART I")
body_end = raw.rfind("End of Project Gutenberg’s Crime")
# find()/rfind() return -1 when a marker is absent (e.g. this cell is re-run
# on already-trimmed text); leave that edge untouched instead of mis-slicing.
if body_start == -1:
    body_start = 0
if body_end == -1:
    body_end = len(raw)
raw = raw[body_start:body_end]

In [0]:
raw.find("PART I")
# 0

0

### Dealing with HTML

In [0]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"

In [0]:
# Download the page and close the HTTP connection as soon as it has been read.
with request.urlopen(url) as page:
    html = page.read().decode('utf8')

In [0]:
html[:60]
# '<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [0]:
# raw = nltk.clean_html(html)
# NotImplementedError: To remove HTML markup, use BeautifulSoup's get_text() function

In [0]:
from bs4 import BeautifulSoup

In [0]:
# raw = BeautifulSoup(html).get_text()
raw = BeautifulSoup(html, "lxml").get_text()

In [0]:
tokens = word_tokenize(raw)

In [0]:
tokens
# ['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", 'die', 'out', ...]

['BBC',
 'NEWS',
 '|',
 'Health',
 '|',
 'Blondes',
 "'to",
 'die',
 'out',
 'in',
 '200',
 "years'",
 'NEWS',
 'SPORT',
 'WEATHER',
 'WORLD',
 'SERVICE',
 'A-Z',
 'INDEX',
 'SEARCH',
 'You',
 'are',
 'in',
 ':',
 'Health',
 'News',
 'Front',
 'Page',
 'Africa',
 'Americas',
 'Asia-Pacific',
 'Europe',
 'Middle',
 'East',
 'South',
 'Asia',
 'UK',
 'Business',
 'Entertainment',
 'Science/Nature',
 'Technology',
 'Health',
 'Medical',
 'notes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Talking',
 'Point',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Country',
 'Profiles',
 'In',
 'Depth',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Programmes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'SERVICES',
 'Daily',
 'E-mail',
 'News',
 'Ticker',
 'Mobile/PDAs',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Text',
 'Only',
 'Feedback',
 'Help',
 'EDITIONS',
 'Change',
 'to',
 'UK',
 'Friday',
 ',',
 '27',
 'September',
 ',',
 '2002',
 ',',
 '11:51',
 'GMT',
 '12:51'

In [0]:
# Keep only a window of the token list; the leading tokens displayed above are
# site navigation rather than article text.
# NOTE(review): 110 and 390 look hand-picked for this particular page — re-check
# them if the page changes. Re-running this cell slices the already-sliced list.
tokens = tokens[110:390]

In [0]:
text = nltk.Text(tokens)

In [0]:
text.concordance('gene')
# Displaying 5 of 5 matches:
# hey say too few people now carry the gene for blondes to last beyond the next
# blonde hair is caused by a recessive gene . In order for a child to have blond
# have blonde hair , it must have the gene on both sides of the family in the g
# ere is a disadvantage of having that gene or by chance . They do n't disappear
# des would disappear is if having the gene was a disadvantage and I do not thin

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


### Processing Search Engine Results

### Processing RSS Feeds

In [0]:
# need to install feedparser
import feedparser

In [0]:
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")

In [0]:
llog['feed']['title']
# 'Language Log'

'Language Log'

In [0]:
len(llog.entries)
# 15

13

In [0]:
post = llog.entries[2]

In [0]:
post.title
# "He's My BF"

'Emoji in Chinese music video lyric'

In [0]:
content = post.content[0].value

In [0]:
content[:70]
# '<p>Today I was chatting with three of our visiting graduate students f'

'<p>From Charles Belov:</p>\n<p style="padding-left: 40px;">I thought I '

In [0]:
# raw = BeautifulSoup(content).get_text()
raw = BeautifulSoup(content, "lxml").get_text()

In [0]:
word_tokenize(raw)
# ['Today', 'I', 'was', 'chatting', 'with', 'three', 'of', 'our', 'visiting',
# 'graduate', 'students', 'from', 'the', 'PRC', '.', 'Thinking', 'that', 'I',
# 'was', 'being', 'au', 'courant', ',', 'I', 'mentioned', 'the', 'expression',
# 'DUI4XIANG4', '\u5c0d\u8c61', '("', 'boy', '/', 'girl', 'friend', '"', ...]

['From',
 'Charles',
 'Belov',
 ':',
 'I',
 'thought',
 'I',
 'was',
 'going',
 'to',
 'be',
 'sending',
 'you',
 'a',
 'case',
 'of',
 'Google',
 'Translate',
 'munging',
 'a',
 'song',
 'lyric',
 'when',
 'translating',
 'it',
 'from',
 'Chinese',
 'to',
 'English',
 '.',
 'Instead',
 ',',
 'I',
 "'m",
 'sending',
 'you',
 'a',
 'case',
 'of',
 'a',
 'Chinese',
 'music',
 'video',
 'making',
 'use',
 'of',
 'an',
 'emoji',
 'in',
 'the',
 'song',
 'lyrics',
 '.',
 'The',
 'song',
 'in',
 'question',
 'is',
 'gǎibiàn',
 '改變',
 '(',
 'Changes',
 ')',
 'by',
 'Taiwan',
 'rocker',
 'Zhāng',
 'Zhènyuè',
 '張震嶽',
 'A-Yue',
 '.',
 'If',
 'I',
 'copy',
 'the',
 'lyrics',
 'from',
 'Rock',
 'Records',
 "'",
 'posting',
 'on',
 'YouTube',
 ',',
 'Google',
 'Translate',
 'translates',
 'the',
 'line',
 'in',
 'question',
 '``',
 'Wǒ',
 'xiǎng',
 'dàbiàn',
 '我想大便',
 "''",
 'as',
 '``',
 'I',
 'want',
 'to',
 'have',
 'a',
 'bowel',
 'movement',
 '.',
 "''",
 'Now',
 'I',
 'am',
 'familiar',
 'wit

### Reading Local Files

In [0]:
f = open('document.txt')

In [0]:
raw = f.read()
f.close()  # the whole file has been read; release the handle
raw

'Hello\nWorld!\n'

In [0]:
import os
os.listdir('.')

['.gitignore',
 '.ipynb_checkpoints',
 'document.txt',
 'groucho_grammar.cfg',
 'iu_mien_samp.xml',
 'mygrammar.cfg',
 'nltk-book-chap-x.ipynb',
 'nltk-book-chap01.ipynb',
 'nltk-book-chap02-1.ipynb',
 'nltk-book-chap02-2.ipynb',
 'nltk-book-chap02-3.ipynb',
 'nltk-book-chap02-4.ipynb',
 'nltk-book-chap02-5.ipynb',
 'nltk-book-chap03-1.ipynb',
 'nltk-book-chap03-2.ipynb',
 'nltk-book-chap03-3.ipynb',
 'nltk-book-chap03-4.ipynb',
 'nltk-book-chap04-1.ipynb',
 'nltk-book-chap04-2.ipynb',
 'nltk-book-chap05-1.ipynb',
 'nltk-book-chap05-2.ipynb',
 'nltk-book-chap05-3.ipynb',
 'nltk-book-chap05-4.ipynb',
 'nltk-book-chap05-5.ipynb',
 'nltk-book-chap05-6.ipynb',
 'nltk-book-chap06-1.ipynb',
 'nltk-book-chap06-2.ipynb',
 'nltk-book-chap06-3.ipynb',
 'nltk-book-chap06-4_5.ipynb',
 'nltk-book-chap07-1.ipynb',
 'nltk-book-chap07-2.ipynb',
 'nltk-book-chap07-3.ipynb',
 'nltk-book-chap07-4.ipynb',
 'nltk-book-chap07-5.ipynb',
 'nltk-book-chap07-6.ipynb',
 'nltk-book-chap08-1_2.ipynb',
 'nltk-book-

In [0]:
# Print the file line by line without the trailing newlines; the with-block
# closes the file once the loop is done.
with open('document.txt', 'r') as f:
    for line in f:
        print(line.strip())

Hello
World!


In [0]:
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
# The book opens this file with mode 'rU', which is deprecated (and rejected
# by Python 3.11+); plain 'r' already applies universal-newline handling.
with open(path, 'r') as moby_file:
    raw = moby_file.read()

### Extracting Text from PDF, MSWord, and Other Binary Formats

Third-party libraries such as pypdf and pywin32 provide access to these formats.

### Capturing User Input

In [0]:
s = input("Enter some text: ")
# Enter some text: On an exceptionally hot evening early in July

Enter some text: On an exceptionally hot evening early in July


In [0]:
print("You typed", len(word_tokenize(s)), "words.")
# You typed 8 words.

You typed 8 words.


### The NLP Pipeline

In [0]:
# Read the whole document as one string; the with-block closes the file.
with open('document.txt') as document:
    raw = document.read()

In [0]:
type(raw)
# <class 'str'>

str

In [0]:
tokens = word_tokenize(raw)

In [0]:
type(tokens)
# <class 'list'>

list

In [0]:
words = [w.lower() for w in tokens]

In [0]:
type(words)
# <class 'list'>

list

In [0]:
vocab = sorted(set(words))

In [0]:
type(vocab)
# <class 'list'>

list

In [0]:
vocab.append('blog')

In [0]:
raw.append('blog')
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# AttributeError: 'str' object has no attribute 'append'

AttributeError: 'str' object has no attribute 'append'

In [0]:
query = 'Who knows?'

In [0]:
beatles = ['john', 'paul', 'george', 'ringo']

In [0]:
query + beatles
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# TypeError: cannot concatenate 'str' and 'list' objects

TypeError: must be str, not list

## 3.2   Strings: Text Processing at the Lowest Level

### Basic Operations with Strings

In [0]:
monty = 'Monty Python'

In [0]:
monty
# 'Monty Python'

'Monty Python'

In [0]:
circus = "Monty Python's Flying Circus"

In [0]:
circus
# "Monty Python's Flying Circus"

"Monty Python's Flying Circus"

In [0]:
circus = 'Monty Python\'s Flying Circus'

In [0]:
circus
# "Monty Python's Flying Circus"

"Monty Python's Flying Circus"

In [0]:
circus = 'Monty Python's Flying Circus'
#   File "<stdin>", line 1
#     circus = 'Monty Python's Flying Circus'
#                            ^
# SyntaxError: invalid syntax

SyntaxError: invalid syntax (<ipython-input-9-01e0177f4dab>, line 1)

In [0]:
couplet = "Shall I compare thee to a Summer's day?"\
          "Thou are more lovely and more temperate:"

In [0]:
print(couplet)
# Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:

Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:


In [0]:
couplet = ("Rough winds do shake the darling buds of May,"
           "And Summer's lease hath all too short a date:")

In [0]:
print(couplet)
# Rough winds do shake the darling buds of May,And Summer's lease hath all too short a date:

Rough winds do shake the darling buds of May,And Summer's lease hath all too short a date:


In [0]:
couplet = """Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:"""

In [0]:
print(couplet)
# Shall I compare thee to a Summer's day?
# Thou are more lovely and more temperate:

Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:


In [0]:
couplet = '''Rough winds do shake the darling buds of May,
And Summer's lease hath all too short a date:'''

In [0]:
print(couplet)
# Rough winds do shake the darling buds of May,
# And Summer's lease hath all too short a date:

Rough winds do shake the darling buds of May,
And Summer's lease hath all too short a date:


In [0]:
'very' + 'very' + 'very'
# 'veryveryvery'

'veryveryvery'

In [0]:
'very' * 3
# 'veryveryvery'

'veryveryvery'

In [0]:
a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]

In [0]:
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]

In [0]:
for line in b:
    print(line)

            very
          veryvery
        veryveryvery
      veryveryveryvery
    veryveryveryveryvery
  veryveryveryveryveryvery
veryveryveryveryveryveryvery
  veryveryveryveryveryvery
    veryveryveryveryvery
      veryveryveryvery
        veryveryvery
          veryvery
            very


In [0]:
'very' - 'y'
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# TypeError: unsupported operand type(s) for -: 'str' and 'str'

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [0]:
'very' / 2
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# TypeError: unsupported operand type(s) for /: 'str' and 'int'

TypeError: unsupported operand type(s) for /: 'str' and 'int'

### Printing Strings

In [0]:
print(monty)
# Monty Python

Monty Python


In [0]:
grail = 'Holy Grail'

In [0]:
print(monty + grail)
# Monty PythonHoly Grail

Monty PythonHoly Grail


In [0]:
print(monty, grail)
# Monty Python Holy Grail

Monty Python Holy Grail


In [0]:
print(monty, "and the", grail)
# Monty Python and the Holy Grail

### Accessing Individual Characters

In [0]:
monty[0]
# 'M'

'M'

In [0]:
monty[3]
# 't'

't'

In [0]:
monty[5]
# ' '

' '

In [0]:
monty[20]
# Traceback (most recent call last):
#   File "<stdin>", line 1, in ?
# IndexError: string index out of range

IndexError: string index out of range

In [0]:
monty[-1]
# 'n'

'n'

In [0]:
monty[5]
# ' '

' '

In [0]:
monty[-7]
# ' '

' '

In [0]:
sent = 'colorless green ideas sleep furiously'

In [0]:
for char in sent:
    print(char, end=' ')
# c o l o r l e s s   g r e e n   i d e a s   s l e e p   f u r i o u s l y

c o l o r l e s s   g r e e n   i d e a s   s l e e p   f u r i o u s l y 

In [0]:
from nltk.corpus import gutenberg

In [0]:
raw = gutenberg.raw('melville-moby_dick.txt')

In [0]:
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())

In [0]:
fdist.most_common(5)
# [('e', 117092), ('t', 87996), ('a', 77916), ('o', 69326), ('n', 65617)]

[('e', 117092), ('t', 87996), ('a', 77916), ('o', 69326), ('n', 65617)]

In [0]:
[char for (char, count) in fdist.most_common()]
# ['e', 't', 'a', 'o', 'n', 'i', 's', 'h', 'r', 'l', 'd', 'u', 'm', 'c', 'w',
# 'f', 'g', 'p', 'b', 'y', 'v', 'k', 'q', 'j', 'x', 'z']

['e',
 't',
 'a',
 'o',
 'n',
 'i',
 's',
 'h',
 'r',
 'l',
 'd',
 'u',
 'm',
 'c',
 'w',
 'f',
 'g',
 'p',
 'b',
 'y',
 'v',
 'k',
 'q',
 'j',
 'x',
 'z']

### Accessing Substrings

In [0]:
monty[6:10]
# 'Pyth'

'Pyth'

In [0]:
monty[-12:-7]
# 'Monty'

'Monty'

In [0]:
monty[:5]
# 'Monty'

'Monty'

In [0]:
monty[6:]
# 'Python'

'Python'

In [0]:
phrase = 'And now for something completely different'

In [0]:
if 'thing' in phrase:
    print('found "thing"')
# found "thing"

found "thing"


In [0]:
monty.find('Python')
# 6

6

### More Operations on Strings

### The Difference Between Lists and Strings

In [0]:
query = 'Who knows?'

In [0]:
beatles = ['John', 'Paul', 'George', 'Ringo']

In [0]:
query[2]
# 'o'

'o'

In [0]:
beatles[2]
# 'George'

'George'

In [0]:
query[:2]
# 'Wh'

'Wh'

In [0]:
beatles[:2]
# ['John', 'Paul']

['John', 'Paul']

In [0]:
query + " I don't"
# "Who knows? I don't"

"Who knows? I don't"

In [0]:
beatles + 'Brian'
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# TypeError: can only concatenate list (not "str") to list

TypeError: can only concatenate list (not "str") to list

In [0]:
beatles + ['Brian']
# ['John', 'Paul', 'George', 'Ringo', 'Brian']

['John', 'Paul', 'George', 'Ringo', 'Brian']

In [0]:
beatles[0] = "John Lennon"

In [0]:
del beatles[-1]

In [0]:
beatles
# ['John Lennon', 'Paul', 'George']

['John Lennon', 'Paul', 'George']

In [0]:
query[0] = 'F'
# Traceback (most recent call last):
#   File "<stdin>", line 1, in ?
# TypeError: object does not support item assignment

TypeError: 'str' object does not support item assignment

## 3.3   Text Processing with Unicode

### What Is Unicode?

### Extracting Encoded Text from Files

In [0]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

In [0]:
f = open(path, encoding='latin2')

In [0]:
for line in f:
    line = line.strip()
    print(line)
f.close()  # the file has been read to the end; release the handle
# Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
# "Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
# Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
# odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
# Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
# archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.

Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.


In [0]:
f = open(path, encoding='latin2')

In [0]:
for line in f:
    line = line.strip()
    # unicode_escape shows every non-ASCII character as a \xXX or \uXXXX escape
    print(line.encode('unicode_escape'))
f.close()  # the file has been read to the end; release the handle
# b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
# b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
# b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
# b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
# b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
# b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'

b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'


In [0]:
ord('ń')
# 324

324

In [0]:
nacute = '\u0144'

In [0]:
nacute
# 'ń'

'ń'

In [0]:
nacute.encode('utf8')
# b'\xc5\x84'

b'\xc5\x84'

In [0]:
import unicodedata

In [0]:
# Read all lines of the Latin-2 encoded sample; the with-block closes the file.
with open(path, encoding='latin2') as polish_file:
    lines = polish_file.readlines()

In [0]:
line = lines[2]

In [0]:
print(line.encode('unicode_escape'))
# b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'


In [0]:
for c in line:
    if ord(c) > 127:
        print('{} U+{:04x} {}'.format(c.encode('utf8'), ord(c), unicodedata.name(c)))
# b'\xc3\xb3' U+00f3 LATIN SMALL LETTER O WITH ACUTE
# b'\xc5\x9b' U+015b LATIN SMALL LETTER S WITH ACUTE
# b'\xc5\x9a' U+015a LATIN CAPITAL LETTER S WITH ACUTE
# b'\xc4\x85' U+0105 LATIN SMALL LETTER A WITH OGONEK
# b'\xc5\x82' U+0142 LATIN SMALL LETTER L WITH STROKE

b'\xc3\xb3' U+00f3 LATIN SMALL LETTER O WITH ACUTE
b'\xc5\x9b' U+015b LATIN SMALL LETTER S WITH ACUTE
b'\xc5\x9a' U+015a LATIN CAPITAL LETTER S WITH ACUTE
b'\xc4\x85' U+0105 LATIN SMALL LETTER A WITH OGONEK
b'\xc5\x82' U+0142 LATIN SMALL LETTER L WITH STROKE


In [0]:
line.find('zosta\u0142y')
# 54

54

In [0]:
line = line.lower()

In [0]:
line
# 'niemców pod koniec ii wojny światowej na dolny śląsk, zostały\n'

'niemców pod koniec ii wojny światowej na dolny śląsk, zostały\n'

In [0]:
line.encode('unicode_escape')
# b'niemc\\xf3w pod koniec ii wojny \\u015bwiatowej na dolny \\u015bl\\u0105sk, zosta\\u0142y\\n'

b'niemc\\xf3w pod koniec ii wojny \\u015bwiatowej na dolny \\u015bl\\u0105sk, zosta\\u0142y\\n'

In [0]:
import re

In [0]:
# Raw string so that \w reaches the regex engine untouched ('\w' in a normal
# literal is an invalid escape sequence); the re module itself understands the
# \u015b escape in str patterns.
m = re.search(r'\u015b\w*', line)

In [0]:
m.group()
# '\u015bwiatowej'

'światowej'

In [0]:
word_tokenize(line)
# ['niemców', 'pod', 'koniec', 'ii', 'wojny', 'światowej', 'na', 'dolny', 'śląsk', ',', 'zostały']

['niemców',
 'pod',
 'koniec',
 'ii',
 'wojny',
 'światowej',
 'na',
 'dolny',
 'śląsk',
 ',',
 'zostały']

### Using Your Local Encoding in Python

## 3.4   Regular Expressions for Detecting Word Patterns

### Using Basic Metacharacters

In [0]:
import re

In [0]:
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

In [0]:
[w for w in wordlist if re.search('ed$', w)]
# ['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', ...]

['abaissed',
 'abandoned',
 'abased',
 'abashed',
 'abatised',
 'abed',
 'aborted',
 'abridged',
 'abscessed',
 'absconded',
 'absorbed',
 'abstracted',
 'abstricted',
 'accelerated',
 'accepted',
 'accidented',
 'accoladed',
 'accolated',
 'accomplished',
 'accosted',
 'accredited',
 'accursed',
 'accused',
 'accustomed',
 'acetated',
 'acheweed',
 'aciculated',
 'aciliated',
 'acknowledged',
 'acorned',
 'acquainted',
 'acquired',
 'acquisited',
 'acred',
 'aculeated',
 'addebted',
 'added',
 'addicted',
 'addlebrained',
 'addleheaded',
 'addlepated',
 'addorsed',
 'adempted',
 'adfected',
 'adjoined',
 'admired',
 'admitted',
 'adnexed',
 'adopted',
 'adossed',
 'adreamed',
 'adscripted',
 'aduncated',
 'advanced',
 'advised',
 'aeried',
 'aethered',
 'afeared',
 'affected',
 'affectioned',
 'affined',
 'afflicted',
 'affricated',
 'affrighted',
 'affronted',
 'aforenamed',
 'afterfeed',
 'aftershafted',
 'afterthoughted',
 'afterwitted',
 'agazed',
 'aged',
 'agglomerated',
 'aggri

In [0]:
[w for w in wordlist if re.search('^..j..t..$', w)]
# ['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', ...]

['abjectly',
 'adjuster',
 'dejected',
 'dejectly',
 'injector',
 'majestic',
 'objectee',
 'objector',
 'rejecter',
 'rejector',
 'unjilted',
 'unjolted',
 'unjustly']

### Ranges and Closures

In [0]:
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
# ['gold', 'golf', 'hold', 'hole']

['gold', 'golf', 'hold', 'hole']

In [0]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))

In [0]:
[w for w in chat_words if re.search('^m+i+n+e+$', w)]
# ['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine',
# 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [0]:
[w for w in chat_words if re.search('^[ha]+$', w)]
# ['a', 'aaaaaaaaaaaaaaaaa', 'aaahhhh', 'ah', 'ahah', 'ahahah', 'ahh',
# 'ahhahahaha', 'ahhh', 'ahhhh', 'ahhhhhh', 'ahhhhhhhhhhhhhh', 'h', 'ha', 'haaa',
# 'hah', 'haha', 'hahaaa', 'hahah', 'hahaha', 'hahahaa', 'hahahah', 'hahahaha', ...]

['a',
 'aaaaaaaaaaaaaaaaa',
 'aaahhhh',
 'ah',
 'ahah',
 'ahahah',
 'ahh',
 'ahhahahaha',
 'ahhh',
 'ahhhh',
 'ahhhhhh',
 'ahhhhhhhhhhhhhh',
 'h',
 'ha',
 'haaa',
 'hah',
 'haha',
 'hahaaa',
 'hahah',
 'hahaha',
 'hahahaa',
 'hahahah',
 'hahahaha',
 'hahahahaaa',
 'hahahahahaha',
 'hahahahahahaha',
 'hahahahahahahahahahahahahahahaha',
 'hahahhahah',
 'hahhahahaha']

In [0]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [0]:
# Tokens that are decimal numbers. Raw string: '\.' in a normal literal is an
# invalid escape sequence.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
# ['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5',
# '0.50', '0.54', '0.56', '0.60', '0.7', '0.82', '0.84', '0.9', '0.95', '0.99',
# '1.01', '1.1', '1.125', '1.14', '1.1650', '1.17', '1.18', '1.19', '1.2', ...]

['0.0085',
 '0.05',
 '0.1',
 '0.16',
 '0.2',
 '0.25',
 '0.28',
 '0.3',
 '0.4',
 '0.5',
 '0.50',
 '0.54',
 '0.56',
 '0.60',
 '0.7',
 '0.82',
 '0.84',
 '0.9',
 '0.95',
 '0.99',
 '1.01',
 '1.1',
 '1.125',
 '1.14',
 '1.1650',
 '1.17',
 '1.18',
 '1.19',
 '1.2',
 '1.20',
 '1.24',
 '1.25',
 '1.26',
 '1.28',
 '1.35',
 '1.39',
 '1.4',
 '1.457',
 '1.46',
 '1.49',
 '1.5',
 '1.50',
 '1.55',
 '1.56',
 '1.5755',
 '1.5805',
 '1.6',
 '1.61',
 '1.637',
 '1.64',
 '1.65',
 '1.7',
 '1.75',
 '1.76',
 '1.8',
 '1.82',
 '1.8415',
 '1.85',
 '1.8500',
 '1.9',
 '1.916',
 '1.92',
 '10.19',
 '10.2',
 '10.5',
 '107.03',
 '107.9',
 '109.73',
 '11.10',
 '11.5',
 '11.57',
 '11.6',
 '11.72',
 '11.95',
 '112.9',
 '113.2',
 '116.3',
 '116.4',
 '116.7',
 '116.9',
 '118.6',
 '12.09',
 '12.5',
 '12.52',
 '12.68',
 '12.7',
 '12.82',
 '12.97',
 '120.7',
 '1206.26',
 '121.6',
 '126.1',
 '126.15',
 '127.03',
 '129.91',
 '13.1',
 '13.15',
 '13.5',
 '13.50',
 '13.625',
 '13.65',
 '13.73',
 '13.8',
 '13.90',
 '130.6',
 '130.7',
 '

In [0]:
# Upper-case letters followed by a literal dollar sign. Raw string: '\$' in a
# normal literal is an invalid escape sequence.
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
# ['C$', 'US$']

['C$', 'US$']

In [0]:
[w for w in wsj if re.search('^[0-9]{4}$', w)]
# ['1614', '1637', '1787', '1901', '1903', '1917', '1925', '1929', '1933', ...]

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934',
 '1948',
 '1953',
 '1955',
 '1956',
 '1961',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1975',
 '1976',
 '1977',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2005',
 '2009',
 '2017',
 '2019',
 '2029',
 '3057',
 '8300']

In [0]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]
# ['10-day', '10-lap', '10-year', '100-share', '12-point', '12-year', ...]

['10-day',
 '10-lap',
 '10-year',
 '100-share',
 '12-point',
 '12-year',
 '14-hour',
 '15-day',
 '150-point',
 '190-point',
 '20-point',
 '20-stock',
 '21-month',
 '237-seat',
 '240-page',
 '27-year',
 '30-day',
 '30-point',
 '30-share',
 '30-year',
 '300-day',
 '36-day',
 '36-store',
 '42-year',
 '50-state',
 '500-stock',
 '52-week',
 '69-point',
 '84-month',
 '87-store',
 '90-day']

In [0]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
# ['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting',
# 'savings-and-loan']

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

In [0]:
[w for w in wsj if re.search('(ed|ing)$', w)]
# ['62%-owned', 'Absorbed', 'According', 'Adopting', 'Advanced', 'Advancing', ...]

['62%-owned',
 'Absorbed',
 'According',
 'Adopting',
 'Advanced',
 'Advancing',
 'Alfred',
 'Allied',
 'Annualized',
 'Anything',
 'Arbitrage-related',
 'Arbitraging',
 'Asked',
 'Assuming',
 'Atlanta-based',
 'Baking',
 'Banking',
 'Beginning',
 'Beijing',
 'Being',
 'Bermuda-based',
 'Betting',
 'Boeing',
 'Broadcasting',
 'Bucking',
 'Buying',
 'Calif.-based',
 'Change-ringing',
 'Citing',
 'Concerned',
 'Confronted',
 'Conn.based',
 'Consolidated',
 'Continued',
 'Continuing',
 'Declining',
 'Defending',
 'Depending',
 'Designated',
 'Determining',
 'Developed',
 'Died',
 'During',
 'Encouraged',
 'Encouraging',
 'English-speaking',
 'Estimated',
 'Everything',
 'Excluding',
 'Exxon-owned',
 'Faulding',
 'Fed',
 'Feeding',
 'Filling',
 'Filmed',
 'Financing',
 'Following',
 'Founded',
 'Fracturing',
 'Francisco-based',
 'Fred',
 'Funded',
 'Funding',
 'Generalized',
 'Germany-based',
 'Getting',
 'Guaranteed',
 'Having',
 'Heating',
 'Heightened',
 'Holding',
 'Housing',
 'Illumin

## 3.5 Useful Applications of Regular Expressions

### Extracting Word Pieces

In [0]:
import re

In [0]:
word = 'supercalifragilisticexpialidocious'

In [0]:
re.findall('[aeiou]', word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [0]:
len(re.findall('[aeiou]', word))

16

In [0]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [0]:
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall('[aeiou]{2,}', word))

In [0]:
fd.items()

dict_items([('ea', 476), ('oi', 65), ('ou', 329), ('io', 549), ('ee', 217), ('ie', 331), ('ui', 95), ('ua', 109), ('ai', 261), ('ue', 105), ('ia', 253), ('ei', 86), ('iai', 1), ('oo', 174), ('au', 106), ('eau', 10), ('oa', 59), ('oei', 1), ('oe', 15), ('eo', 39), ('uu', 1), ('eu', 18), ('iu', 14), ('aii', 1), ('aiia', 1), ('ae', 11), ('aa', 3), ('oui', 6), ('ieu', 3), ('ao', 6), ('iou', 27), ('uee', 4), ('eou', 5), ('aia', 1), ('uie', 3), ('iao', 1), ('eei', 2), ('uo', 8), ('uou', 5), ('eea', 1), ('ueui', 1), ('ioa', 1), ('ooi', 1)])

In [0]:
# Pull the runs of digits out of a date string and convert them to integers.
# Raw string: '\d' in a normal literal is an invalid escape sequence.
[int(n) for n in re.findall(r'\d+', '2009-12-31')]

[2009, 12, 31]

### Doing More with Word Pieces

In [0]:
regexp = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

In [0]:
def compress(word):
    """Return ``word`` rebuilt from only the pieces matched by ``regexp``.

    ``regexp`` is the notebook-level pattern defined in the preceding cell.
    """
    return ''.join(re.findall(regexp, word))

In [0]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')

In [0]:
nltk.tokenwrap(compress(w) for w in english_udhr[:75])

'Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and\nof the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn\nof frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn\nrghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,\nand the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and'

In [0]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')

In [0]:
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]

In [0]:
cfd = nltk.ConditionalFreqDist(cvs)

In [0]:
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [0]:
cv_word_pairs = [
    (cv, w) for w in rotokas_words
    for cv in re.findall(r'[ptksvr][aeiou]', w)
]

In [0]:
cv_index = nltk.Index(cv_word_pairs)

In [0]:
cv_index['su']

['kasuari']

In [0]:
cv_index['po']

['kaapo',
 'kaapopato',
 'kaipori',
 'kaiporipie',
 'kaiporivira',
 'kapo',
 'kapoa',
 'kapokao',
 'kapokapo',
 'kapokapo',
 'kapokapoa',
 'kapokapoa',
 'kapokapora',
 'kapokapora',
 'kapokaporo',
 'kapokaporo',
 'kapokari',
 'kapokarito',
 'kapokoa',
 'kapoo',
 'kapooto',
 'kapoovira',
 'kapopaa',
 'kaporo',
 'kaporo',
 'kaporopa',
 'kaporoto',
 'kapoto',
 'karokaropo',
 'karopo',
 'kepo',
 'kepoi',
 'keposi',
 'kepoto']

### Finding Word Stems

In [0]:
def stem(word):
    """Strip the first matching suffix from ``word``.

    Suffixes are tried in the listed order and only one is removed; a word
    that ends with none of them is returned unchanged.
    """
    suffixes = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')
    matched = next((sfx for sfx in suffixes if word.endswith(sfx)), None)
    if matched is None:
        return word
    return word[:len(word) - len(matched)]

In [0]:
re.findall('^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['ing']

In [0]:
re.findall('^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['processing']

In [0]:
re.findall('^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

[('process', 'ing')]

In [0]:
re.findall('^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('processe', 's')]

In [0]:
re.findall('^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('process', 'es')]

In [0]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')

[('language', '')]

In [0]:
def stem(word):
    """Strip one common English suffix from ``word`` using a regular expression.

    The stem part is matched non-greedily, so the suffix is taken as early in
    the word as possible: 'processes' splits into 'process' + 'es', not
    'processe' + 's'. The suffix group is optional, so a word without any
    listed suffix comes back unchanged.

    Returns the stem. If the pattern cannot match at all (for example a token
    with an embedded newline, which '.' does not cross), ``word`` is returned
    as-is instead of raising IndexError.
    """
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    match = re.match(regexp, word)
    if match is None:
        return word
    # Group 1 is the stem; group 2 (the suffix) is discarded.
    return match.group(1)

In [0]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [0]:
tokens = nltk.word_tokenize(raw)

In [0]:
[stem(t) for t in tokens]

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'women',
 'ly',
 'in',
 'pond',
 'distribut',
 'sword',
 'i',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'Supreme',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

### Searching Tokenized Text

In [0]:
from nltk.corpus import gutenberg, nps_chat

In [0]:
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))

In [0]:
moby.findall("<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [0]:
chat = nltk.Text(nps_chat.words())
chat.findall("<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [0]:
chat.findall("<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [0]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
nltk.re_show(r'\d+', 'aaa-bbbb, 1234, oooo0')

aaa-bbbb, {1234}, oooo{0}


In [0]:
nltk.app.nemo()

In [0]:
from nltk.corpus import brown

In [0]:
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
# Raw string keeps the regex escape \w intact without an invalid-escape warning.
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


In [0]:
# Raw string keeps the regex escape \w intact without an invalid-escape warning.
hobbies_learned.findall(r"<as> <\w*> <as> <\w*s>")

as coppery as Delawares; as well as injuries; as much as was; as well
as personalities; as much as glass; as importantly as his


## 3.6 Normalizing Text

In [0]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)

### Stemmers

In [0]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

Observe that the Porter stemmer correctly handles the word lying (mapping it to lie), whereas the Lancaster stemmer does not.

In [0]:
[porter.stem(t) for t in tokens]

['denni',
 ':',
 'listen',
 ',',
 'strang',
 'women',
 'lie',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandat',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcic',
 'aquat',
 'ceremoni',
 '.']

In [0]:
[lancaster.stem(t) for t in tokens]

['den',
 ':',
 'list',
 ',',
 'strange',
 'wom',
 'lying',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'bas',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'pow',
 'der',
 'from',
 'a',
 'mand',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'som',
 'farc',
 'aqu',
 'ceremony',
 '.']

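The Porter stemmer is a good choice if you are
indexing some texts and want to support search using alternative forms of words.
The `IndexedText` class below illustrates this: it indexes a text by stem, so a search for one form of a word also finds its other forms.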

In [0]:
class IndexedText(object):
    """A token sequence indexed by stem, supporting stem-based concordance.

    Every token of ``text`` is stemmed (and lower-cased) and its position is
    recorded under that stem, so that looking up any form of a word finds
    every position whose token shares the same stem.
    """

    def __init__(self, stemmer, text):
        """Build the stem index.

        :param stemmer: object exposing a ``stem(word)`` method.
        :param text: indexable, sliceable sequence of string tokens.
        """
        self._text = text
        self._stemmer = stemmer
        # Maps each normalised stem to the token positions where it occurs.
        self._index = nltk.Index(
            (self._stem(word), i)
            for (i, word) in enumerate(text)
        )

    def concordance(self, word, width=40):
        """Print one line of context for every token sharing ``word``'s stem.

        :param word: query word; it is stemmed before the lookup.
        :param width: number of characters shown on each side of the hit.
        """
        key = self._stem(word)
        wc = width // 4  # words of context
        for i in self._index[key]:
            # Clamp the left edge at 0: for a hit within the first ``wc``
            # tokens, ``i - wc`` is negative and a negative slice start would
            # count from the end of the text, losing the real left context.
            lcontext = ' '.join(self._text[max(0, i - wc):i])
            rcontext = ' '.join(self._text[i:i+wc])
            # Right-align the left context and left-align the right context
            # so that the matched tokens line up in a column.
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        """Return the lower-cased stem used as the index key for ``word``."""
        return self._stemmer.stem(word).lower()

In [0]:
porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance('lie')

r king ! DENNIS : Listen , strange women lying in ponds distributing swords is no
 beat a very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of
       Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded !   
doctors immediately ! No , no , please ! Lie down . [ clap clap ] PIGLET : Well  
ere is much danger , for beyond the cave lies the Gorge of Eternal Peril , which 
   you . Oh ... TIM : To the north there lies a cave -- the cave of Caerbannog --
h it and lived ! Bones of full fifty men lie strewn about its lair . So , brave k
not stop our fight ' til each one of you lies dead , and the Holy Grail returns t


### Lemmatization

In [0]:
wnl = nltk.WordNetLemmatizer()

In [0]:
[wnl.lemmatize(t) for t in tokens]

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'woman',
 'lying',
 'in',
 'pond',
 'distributing',
 'sword',
 'is',
 'no',
 'basis',
 'for',
 'a',
 'system',
 'of',
 'government',
 '.',
 'Supreme',
 'executive',
 'power',
 'derives',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

The WordNet lemmatizer is a good choice if you want to compile the vocabulary of
some texts and want a list of valid lemmas (or lexicon headwords).

## 3.7 Regular Expressions for Tokenizing Text

### Simple Approaches to Tokenization

In [0]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""

In [0]:
re.split(' ', raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone\nthough),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very\nwell',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [0]:
re.split('[ \t\n]+', raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very',
 'well',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [0]:
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
re.split(r'\s+', raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very',
 'well',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [0]:
# Raw string: '\W' in a normal string literal is an invalid escape sequence.
re.split(r'\W+', raw)

['',
 'When',
 'I',
 'M',
 'a',
 'Duchess',
 'she',
 'said',
 'to',
 'herself',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 'I',
 'won',
 't',
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 'Maybe',
 'it',
 's',
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 'tempered',
 '']

In [0]:
'xx'.split('x')

['', '', '']

In [0]:
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
re.findall(r'\w+', raw)

['When',
 'I',
 'M',
 'a',
 'Duchess',
 'she',
 'said',
 'to',
 'herself',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 'I',
 'won',
 't',
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 'Maybe',
 'it',
 's',
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 'tempered']

In [0]:
# Raw string: '\w' and '\S' are invalid escapes in a normal string literal.
re.findall(r'\w+|\S\w*', raw)

["'When",
 'I',
 "'M",
 'a',
 'Duchess',
 ',',
 "'",
 'she',
 'said',
 'to',
 'herself',
 ',',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 ')',
 ',',
 "'I",
 'won',
 "'t",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 '-',
 '-Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 '-tempered',
 ',',
 "'",
 '.',
 '.',
 '.']

In [0]:
raw

"'When I'M a Duchess,' she said to herself, (not in a very hopeful tone\nthough), 'I won't have any pepper in my kitchen AT ALL. Soup does very\nwell without--Maybe it's always pepper that makes people hot-tempered,'..."

In [0]:
# Raw string: '\w' and '\S' are invalid escapes in a normal string literal.
re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw)

["'",
 'When',
 "I'M",
 'a',
 'Duchess',
 ',',
 "'",
 'she',
 'said',
 'to',
 'herself',
 ',',
 '(',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 ')',
 ',',
 "'",
 'I',
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 '--',
 'Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot-tempered',
 ',',
 "'",
 '...']

### NLTK’s Regular Expression Tokenizer

In [0]:
text = 'That U.S.A. poster-print costs $12.40...'
# The pattern is a raw string so that regex escapes (\. \w \$ \d \| \n) reach
# the regex engine verbatim instead of being treated as (invalid) string
# escapes by the Python parser.
pattern = re.compile(
    r'''(?x)                 # set flag to allow verbose regexps
    (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
    | \w'                   # single-letter elided pronoun or determiner with final apostrophe
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                # ellipsis
    | [][.,;"'?():_`\|\n-]  # these are separate tokens; includes ], [, -
    '''
)
nltk.regexp_tokenize(text, pattern)

['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']

### Further Issues with Tokenization

## 3.8 Segmentation

### Sentence Segmentation

In [0]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())

20.250994070456922

In [0]:
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
pprint.pprint(sents[171:181])

['In the wild events which were to follow this girl had no\n'
 'part at all; he never saw her again until all his tale was over.',
 'And yet, in some indescribable way, she kept recurring like a\n'
 'motive in music through all his mad adventures afterwards, and the\n'
 'glory of her strange hair ran like a red thread through those dark\n'
 'and ill-drawn tapestries of the night.',
 'For what followed was so\nimprobable, that it might well have been a dream.',
 'When Syme went out into the starlit street, he found it for the\n'
 'moment empty.',
 'Then he realised (in some odd way) that the silence\n'
 'was rather a living silence than a dead one.',
 'Directly outside the\n'
 'door stood a street lamp, whose gleam gilded the leaves of the tree\n'
 'that bent out over the fence behind him.',
 'About a foot from the\n'
 'lamp-post stood a figure almost as rigid and motionless as the\n'
 'lamp-post itself.',
 'The tall hat and long frock coat were black; the\n'
 'face, in an abrupt shadow

### Word Segmentation

In [0]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"

In [0]:
def segment(text, segs):
    """Split ``text`` into words at the boundaries marked in ``segs``.

    A ``'1'`` at index ``i`` of ``segs`` means a word ends after character
    ``i`` of ``text``. Whatever follows the last marked boundary becomes the
    final word.
    """
    # Index just past each marked boundary.
    cuts = [pos + 1 for pos, flag in enumerate(segs) if flag == '1']
    starts = [0] + cuts
    ends = cuts + [len(text)]
    return [text[begin:end] for begin, end in zip(starts, ends)]

In [0]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"

In [0]:
segment(text, seg1)

['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

In [0]:
segment(text, seg2)

['do',
 'you',
 'see',
 'the',
 'kitty',
 'see',
 'the',
 'doggy',
 'do',
 'you',
 'like',
 'the',
 'kitty',
 'like',
 'the',
 'doggy']

In [0]:
def evaluate(text, segs):
    """Score a segmentation of ``text``; ``anneal`` keeps the lowest score.

    The score is the number of words produced by ``segment(text, segs)``
    plus the length of the distinct words joined by single spaces.
    """
    words = segment(text, segs)
    # The joined length does not depend on the set's iteration order.
    lexicon = ' '.join(set(words))
    return len(words) + len(lexicon)

In [0]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
segment(text, seg3)

['doyou',
 'see',
 'thekitt',
 'y',
 'see',
 'thedogg',
 'y',
 'doyou',
 'like',
 'thekitt',
 'y',
 'like',
 'thedogg',
 'y']

In [0]:
evaluate(text, seg3)

46

In [0]:
evaluate(text, seg2)

47

In [0]:
evaluate(text, seg1)

63

In [0]:
from random import randint


def flip(segs, pos):
    """Return a copy of ``segs`` with the binary digit at ``pos`` toggled."""
    toggled = str(1 - int(segs[pos]))
    return segs[:pos] + toggled + segs[pos + 1:]


def flip_n(segs, n):
    """Toggle ``n`` randomly chosen positions of ``segs``, one after another.

    Positions are drawn independently, so the same position may be picked
    (and toggled back) more than once.
    """
    remaining = n
    while remaining > 0:
        position = randint(0, len(segs) - 1)
        segs = flip(segs, position)
        remaining -= 1
    return segs


def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of ``text`` by random bit flips.

    Starting from ``segs``, each round tries ``iterations`` random
    perturbations of the current segmentation (each flipping
    ``round(temperature)`` positions) and adopts the best-scoring one if it
    beats the current score. The temperature starts at ``len(segs)`` and is
    divided by ``cooling_rate`` after every round; the search stops once it
    is no longer above 0.5. One progress line is printed per round.

    NOTE(review): the loop only terminates if ``cooling_rate`` is greater
    than 1.

    :return: the segmentation string held when the search stops.
    """
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs = segs
        best = evaluate(text, segs)
        # The flip count is fixed for the whole round.
        flips = int(round(temperature))
        for _ in range(iterations):
            guess = flip_n(segs, flips)
            score = evaluate(text, guess)
            if score < best:
                best = score
                best_segs = guess
        segs = best_segs
        temperature /= cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs

In [0]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
60 ['do', 'youse', 'ethekitty', 's', 'e', 'e', 'the', 'doggy', 'do', 'youlik', 'ethekitty', 'likethe', 'doggy']
60 ['do', 'youse', 'ethekitty', 's', 'e', 'e', 'the', 'doggy', 'do', 'youlik', 'ethekitty', 'likethe', 'doggy']
60 ['do', 'youse', 'ethekitty', 's', 'e', 'e', 'the', 'doggy', 'do', 'youlik', 'ethekitty', 'likethe', 'doggy']
60 ['do', 'youse', 'ethekitty', 's', 'e', 'e', 'the', 'doggy', 'do', 'youlik', 'ethekitty', 'likethe', 'doggy']
59 ['do', 'you', 'se

'0000101000100001010001000010000100100010000100100010000'

## 3.9 Formatting: From Lists to Strings

### From Lists to Strings

In [0]:
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']

In [0]:
' '.join(silly)

'We called him Tortoise because he taught us .'

In [0]:
';'.join(silly)

'We;called;him;Tortoise;because;he;taught;us;.'

In [0]:
''.join(silly)

'WecalledhimTortoisebecausehetaughtus.'

### Strings and Formats

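The examples in this section use old-style `%` string formatting. For the newer `str.format` style available in Python 3, see https://pyformat.info/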

In [0]:
word = 'cat'
sentence = """hello
world"""

In [0]:
print(word)

cat


In [0]:
print(sentence)

hello
world


In [0]:
word

'cat'

In [0]:
sentence

'hello\nworld'

In [0]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])

In [0]:
for word in fdist:
    print(word, '->', fdist[word], ';')

dog -> 4 ;
cat -> 3 ;
snake -> 1 ;


In [0]:
for word in fdist:
    print('%s->%d;' % (word, fdist[word]))

dog->4;
cat->3;
snake->1;


In [0]:
'%s->%d;' % ('cat', 3)

'cat->3;'

In [0]:
'%s->%d;' % 'cat'

TypeError: not enough arguments for format string

In [0]:
'%s->' % 'cat'

'cat->'

In [0]:
'%d' % 3

'3'

In [0]:
'I want a %s right now' % 'coffee'

'I want a coffee right now'

In [0]:
"%s wants a %s %s" % ("Lee", "sandwich", "for lunch")

'Lee wants a sandwich for lunch'

In [0]:
template = 'Lee wants a %s right now'
menu = ['sandwich', 'spam fritter', 'pancake']
for snack in menu:
    print(template % snack)

Lee wants a sandwich right now
Lee wants a spam fritter right now
Lee wants a pancake right now


### Lining Things Up

In [0]:
'%6s' % 'dog'

'   dog'

In [0]:
'%-6s' % 'dog'

'dog   '

In [0]:
width = 6
'%-*s' % (width, 'dog')

'dog   '

In [0]:
count, total = 3205, 9375
"accuracy for %d words: %2.4f%%" % (total, 100 * count / total)

'accuracy for 9375 words: 34.1867%'

In [0]:
def tabulate(cfdist, words, categories):
    """Print a table of counts with one row per category and one column per word.

    :param cfdist: mapping indexed as ``cfdist[category][word]`` giving an
        integer count.
    :param words: column headings, in display order.
    :param categories: row headings, in display order.
    """
    def emit(cell):
        # Each cell is followed by one space; rows are ended separately.
        print(cell, end=' ')

    emit('%-16s' % 'Category')
    for heading in words:
        emit('%6s' % heading)
    print()

    for row_label in categories:
        emit('%-16s' % row_label)
        for column in words:
            emit('%6d' % cfdist[row_label][column])
        print()

In [0]:
from nltk.corpus import brown


cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

In [0]:
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)

Category            can  could    may  might   must   will 
news                 93     86     66     38     50    389 
religion             82     59     78     12     54     71 
hobbies             268     58    131     22     83    264 
science_fiction      16     49      4     12      8     16 
romance              74    193     11     51     45     43 
humor                16     30      8      8      9     13 


In [0]:
'%*s' % (15, "Monty Python")

'   Monty Python'

### Writing Results to a File

In [0]:
# Write the sorted Genesis (KJV) vocabulary to output.txt, one word per line,
# followed by a final line holding the vocabulary size (also echoed here).
with open('output.txt', 'w') as output_file:
    words = set(nltk.corpus.genesis.words('english-kjv.txt'))
    for word in sorted(words):
        print(word, file=output_file)
    print(len(words))
    print(len(words), file=output_file)

2789


### Text Wrapping

In [0]:
saying = ['After', 'all', 'is', 'said', 'and', 'done',
          ',', 'more', 'is', 'said', 'than', 'done', '.']
for word in saying:
    print(word, '(' + str(len(word)) + '),', end=' ')

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more (4), is (2), said (4), than (4), done (4), . (1), 

In [0]:
from textwrap import fill


format_string = '%s_(%d),'
pieces = [format_string % (word, len(word)) for word in saying]
output = ' '.join(pieces)
wrapped = fill(output)

In [0]:
print(wrapped)

After_(5), all_(3), is_(2), said_(4), and_(3), done_(4), ,_(1),
more_(4), is_(2), said_(4), than_(4), done_(4), ._(1),


In [0]:
print(wrapped.replace('_', ' '))

After (5), all (3), is (2), said (4), and (3), done (4), , (1),
more (4), is (2), said (4), than (4), done (4), . (1),
