Natural Language Processing with Python – Analyzing Text with the Natural Language Toolkit
Steven Bird, Ewan Klein, and Edward Loper
http://www.nltk.org/book/

# Chapter 3. Processing Raw Text

In [17]:
%matplotlib inline

import nltk, re, pprint
from nltk import word_tokenize

## 3.9 Formatting: From Lists to Strings

### From Lists to Strings

In [2]:
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']

In [3]:
' '.join(silly)

'We called him Tortoise because he taught us .'

In [4]:
';'.join(silly)

'We;called;him;Tortoise;because;he;taught;us;.'

In [5]:
''.join(silly)

'WecalledhimTortoisebecausehetaughtus.'

### Strings and Formats

In [10]:
word = 'cat'
sentence = """hello
world"""

In [11]:
print(word)

cat


In [12]:
print(sentence)

hello
world


In [14]:
word

'cat'

In [15]:
sentence

'hello\nworld'

In [18]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])

In [19]:
# print() puts a single space between its arguments, so every field here,
# including the trailing ';', ends up separated by a blank.
for word in fdist:
    print(word, '->', fdist[word], ';')

dog -> 4 ;
cat -> 3 ;
snake -> 1 ;


In [21]:
# A format string gives exact control over spacing: %s is filled with the
# word and %d with its integer count, taken in order from the tuple.
for word in fdist:
    print('%s->%d;' % (word, fdist[word]))

dog->4;
cat->3;
snake->1;


In [22]:
'%s->%d;' % ('cat', 3)

'cat->3;'

In [23]:
# The format string has two conversions (%s and %d) but only one value is
# supplied, so the % operator raises TypeError. The error is the point of this
# cell; it is caught and printed so that "Restart & Run All" does not stop here.
try:
    '%s->%d;' % 'cat'
except TypeError as error:
    print('TypeError:', error)

TypeError: not enough arguments for format string

In [24]:
'%s->' % 'cat'

'cat->'

In [25]:
'%d' % 3

'3'

In [26]:
'I want a %s right now' % 'coffee'

'I want a coffee right now'

In [27]:
"%s wants a %s %s" % ("Lee", "sandwich", "for lunch")

'Lee wants a sandwich for lunch'

In [29]:
# A format string is an ordinary string: it can be stored in a variable and
# applied later to each value in turn.
template = 'Lee wants a %s right now'
menu = ['sandwich', 'spam fritter', 'pancake']
for snack in menu:
    print(template % snack)

Lee wants a sandwich right now
Lee wants a spam fritter right now
Lee wants a pancake right now


### Lining Things Up

In [30]:
'%6s' % 'dog'

'   dog'

In [31]:
'%-6s' % 'dog'

'dog   '

In [32]:
# '*' in place of a number means the field width is read from the argument
# tuple (here `width`), so the padding can be chosen at run time.
width = 6
'%-*s' % (width, 'dog')

'dog   '

In [33]:
count, total = 3205, 9375
# '%%' produces a literal percent sign; '.4f' shows the percentage rounded
# to four decimal places.
"accuracy for %d words: %2.4f%%" % (total, 100 * count / total)

'accuracy for 9375 words: 34.1867%'

In [47]:
def tabulate(cfdist, words, categories, label_width=16, cell_width=6):
    """Print a table of counts: one row per category, one column per word.

    Args:
        cfdist: Mapping indexed as ``cfdist[category][word]``; each value
            looked up is formatted with ``%d``.
        words: Column keys, also used as the column headings.
        categories: Row keys, also used as the row headings.
        label_width: Minimum width of the left-aligned heading column.
        cell_width: Minimum width of each right-aligned count column.

    Every field is followed by a single space, so each printed line ends
    with a trailing blank. Values longer than their width are not truncated;
    they simply push the rest of the line to the right.
    """
    # The '*' conversions take their width from the argument tuple, which lets
    # the header and the body share the same two width settings.
    print('%-*s' % (label_width, 'Category'), end=' ')
    for word in words:  # column headings
        print('%*s' % (cell_width, word), end=' ')
    print()
    for category in categories:
        print('%-*s' % (label_width, category), end=' ')  # row heading
        for word in words:  # one cell per column
            print('%*d' % (cell_width, cfdist[category][word]), end=' ')
        print()

In [41]:
from nltk.corpus import brown


# Feed one (genre, word) pair per token, for every Brown category, into the
# conditional frequency distribution; the result is indexed as
# cfd[genre][word] when the table is printed.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

In [48]:
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
# Argument order is (cfdist, words, categories): the modals become the
# columns and the genres become the rows.
tabulate(cfd, modals, genres)

Category            can  could    may  might   must   will 
news                 93     86     66     38     50    389 
religion             82     59     78     12     54     71 
hobbies             268     58    131     22     83    264 
science_fiction      16     49      4     12      8     16 
romance              74    193     11     51     45     43 
humor                16     30      8      8      9     13 


In [49]:
'%*s' % (15, "Monty Python")

'   Monty Python'

### Writing Results to a File

In [68]:
# Build the vocabulary before opening the file: mode 'w' truncates output.txt
# as soon as it is opened, so reading the corpus first means a failure there
# cannot leave an empty file behind.
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
# Pin the encoding; without it open() uses a platform-dependent default and
# the same notebook can write different bytes on different machines.
with open('output.txt', 'w', encoding='utf-8') as output_file:
    for word in sorted(words):
        output_file.write(word + '\n')
    # The last line of the file records how many distinct words were written.
    output_file.write(str(len(words)) + "\n")
print(len(words))

2789


### Text Wrapping

In [71]:
saying = ['After', 'all', 'is', 'said', 'and', 'done',
          ',', 'more', 'is', 'said', 'than', 'done', '.']
# end=' ' replaces the newline print() would normally append, so all the
# "word (length)," pairs come out on one long, unwrapped line.
for word in saying:
    print(word, '(' + str(len(word)) + '),', end=' ')

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more (4), is (2), said (4), than (4), done (4), . (1), 

In [80]:
from textwrap import fill


# Join each word to its length with '_' instead of a space, so that fill()
# finds no whitespace inside a "word_(n)," piece and can only break lines
# between pieces. The underscores are turned back into spaces when the
# wrapped text is printed with wrapped.replace('_', ' ').
format_string = '%s_(%d),'
pieces = [format_string % (word, len(word)) for word in saying]
output = ' '.join(pieces)
# fill() is called without a width, so it wraps at textwrap's default.
wrapped = fill(output)

In [81]:
print(wrapped)

After_(5), all_(3), is_(2), said_(4), and_(3), done_(4), ,_(1),
more_(4), is_(2), said_(4), than_(4), done_(4), ._(1),


In [82]:
print(wrapped.replace('_', ' '))

After (5), all (3), is (2), said (4), and (3), done (4), , (1),
more (4), is (2), said (4), than (4), done (4), . (1),
