In [46]:
from nltk.corpus import gutenberg
from collections import Counter

In [47]:
training_set = gutenberg.fileids().copy()
test_set = 'austen-persuasion.txt'
training_set.remove(test_set)

In [48]:
print('training_set:', training_set)
print('test_set:', test_set)

training_set: ['austen-emma.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
test_set: austen-persuasion.txt


In [49]:
sents = gutenberg.sents(training_set)
words = gutenberg.words(training_set)

In [50]:
print('sents:', len(sents), 'words:', len(words), 'avg length:', len(words)/len(sents))

sents: 94805 words: 2523442 avg length: 26.617182638046515


In [51]:
unigram = Counter(words)
print('words:', len(words), 'vocabulary:', len(unigram))

words: 2523442 vocabulary: 50738


In [52]:
unigram.most_common()[:20]

[(',', 179341),
 ('the', 122628),
 ('and', 76107),
 ('.', 71005),
 ('of', 67514),
 (':', 47282),
 ('to', 43668),
 ('a', 30975),
 ('in', 30613),
 ('I', 29097),
 ('that', 26436),
 (';', 26039),
 ('he', 21462),
 ('his', 19960),
 ("'", 19348),
 ('it', 18877),
 ('was', 17228),
 ('And', 16471),
 ('with', 16184),
 ('for', 16165)]

In [98]:
import string
punct = string.punctuation
print('punctuation:', punct)

unk_list = [w for w, c in unigram.items() if c == 1]
print('uncommon word:', unk_list[:10])

punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
uncommon word: ['1816', 'valetudinarian', 'Matrimony', 'chatted', 'curtseys', 'bangs', 'Dirty', 'drizzle', 'Mitchell', 'Success']


In [131]:
unk_list = set(unk_list)
def preprocessing(sents):
    for c, sent in enumerate(sents):
        if c % 10000 == 0:
            print(c, end=' ')
        for i, word in enumerate(sent):
            if word in punct:
                sent.remove(word)
            elif word.isdigit():
                sent[i] = '_NUM_'
            elif word in unk_list:
                sent[i] = '_UNK_'
        sent.insert(0, '<s>')
        sent.append('</s>')
sample_sents = list(sents[:5]).copy()
preprocessing(sample_sents)
print(sample_sents)

0 [['<s>', 'Emma', 'by', 'Jane', 'Austen', '_NUM_', '</s>'], ['<s>', 'VOLUME', 'I', '</s>'], ['<s>', 'CHAPTER', 'I', '</s>'], ['<s>', 'Emma', 'Woodhouse', 'handsome', 'clever', 'and', 'rich', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', 'and', 'had', 'lived', 'nearly', 'twenty', 'one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '</s>'], ['<s>', 'She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', 'indulgent', 'father', 'and', 'had', 'in', 'consequence', 'of', 'her', 'sister', 's', 'marriage', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '</s>']]


## make n-gram

In [132]:
preprocessed_sents = list(sents).copy()
preprocessing(preprocessed_sents)

0 10000 20000 30000 40000 50000 60000 70000 80000 90000 

In [142]:
def make_ngram(sents, n, container):
    for i, sent in enumerate(sents):
        for i in range(0, len(sent)-n+1):
            gram = ' '.join(sent[i:i+n])
            container.setdefault(gram, 0)
            container[gram] += 1

In [148]:
unigram = {}
make_ngram(preprocessed_sents, 1, unigram)
print('unigram: ', list(unigram.items())[:10])

bigram = {}
make_ngram(preprocessed_sents, 2, bigram)
print('bigram: ', list(bigram.items())[:10])

trigram = {}
make_ngram(preprocessed_sents, 3, trigram)
print('trigram: ', list(trigram.items())[:10])

unigram:  [('<s>', 94805), ('Emma', 865), ('by', 7601), ('Jane', 302), ('Austen', 2), ('_NUM_', 26919), ('</s>', 94805), ('VOLUME', 3), ('I', 29097), ('CHAPTER', 291)]
bigram:  [('<s> Emma', 223), ('Emma by', 2), ('by Jane', 3), ('Jane Austen', 2), ('Austen _NUM_', 2), ('_NUM_ </s>', 294), ('<s> VOLUME', 3), ('VOLUME I', 1), ('I </s>', 117), ('<s> CHAPTER', 276)]
trigram:  [('<s> Emma by', 1), ('Emma by Jane', 1), ('by Jane Austen', 2), ('Jane Austen _NUM_', 2), ('Austen _NUM_ </s>', 2), ('<s> VOLUME I', 1), ('VOLUME I </s>', 1), ('<s> CHAPTER I', 8), ('CHAPTER I </s>', 10), ('<s> Emma Woodhouse', 1)]


In [153]:
def write_model(path, container):
    with open(path, 'w') as out:
        out.write('\n'.join([gram + '\t' + str(count) for gram, count in container.items()]))

write_model('./unigram.txt', unigram)
write_model('./bigram.txt', bigram)
write_model('./trigram.txt', trigram)

In [8]:
%%latex
\begin{align}
P(w_n|w_1 \cdots w_{n-1}) = \frac{P(w_1 \cdots w_n)}{P(w_1 \cdots w_{n-1})}
\end{align}

<IPython.core.display.Latex object>

In [158]:
sents[0]

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']']