In [1]:
import numpy as np
import nltk
nltk.download('gutenberg')
from collections import defaultdict

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\2019c\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


In [2]:
# List the file ids of every Project Gutenberg text that ships with NLTK's gutenberg corpus
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

As we can see, the gutenberg module allows easy access to a subset of the books available on the Project Gutenberg website. In this example, we'll focus on shakespeare-caesar, shakespeare-hamlet, and shakespeare-macbeth. The plays come as .txt files; they can be read with the nltk.corpus.gutenberg.words function, which returns them already tokenized.

In [3]:
# Keep only the Gutenberg file ids whose name starts with 'shakespeare'
shakespeare_corpora = [
    fileid
    for fileid in nltk.corpus.gutenberg.fileids()
    if fileid.startswith('shakespeare')
]
shakespeare_corpora

['shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt']

In [10]:
# Map each selected file id to its tokenized word sequence
corpora = dict(
    zip(
        shakespeare_corpora,
        map(nltk.corpus.gutenberg.words, shakespeare_corpora),
    )
)

# Preview the first 100 tokens of the first corpus as a single string
words = corpora[shakespeare_corpora[0]][:100]
" ".join(words)

'[ The Tragedie of Julius Caesar by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Flauius , Murellus , and certaine Commoners ouer the Stage . Flauius . Hence : home you idle Creatures , get you home : Is this a Holiday ? What , know you not ( Being Mechanicall ) you ought not walke Vpon a labouring day , without the signe Of your Profession ? Speake , what Trade art thou ? Car . Why Sir , a Carpenter Mur . Where is thy Leather Apron , and thy Rule ? What dost'

In [11]:
# Inspect the tokenized word sequences stored for each selected corpus
corpora.values()

dict_values([['[', 'The', 'Tragedie', 'of', 'Julius', 'Caesar', ...], ['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...], ['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', ...]])

In [17]:
# Build a bigram table: for every lower-cased token, count how often each
# other token appears immediately after it. Pairs are formed within a corpus
# only, never across the boundary between two corpora.
#
# Shape of the result, e.g.:
# {
#     'the': {'king': 10, 'queen': 5},
#     'a':   {'king': 2,  'queen': 8},
# }

counts = {}
for corpus in corpora.values():
    lowered = [word.lower() for word in corpus]
    # Pair every token with its successor: (t0, t1), (t1, t2), ...
    for token, next_token in zip(lowered, lowered[1:]):
        followers = counts.setdefault(token, {})
        followers[next_token] = followers.get(next_token, 0) + 1

In [22]:
# First 10 (next_token, count) pairs recorded after the token "from", in insertion order
list(counts['from'].items())[:10]

[('the', 37),
 ('caesars', 1),
 ('your', 5),
 ('their', 3),
 ('brutus', 1),
 ('that', 2),
 ('seuerall', 1),
 ('qualitie', 1),
 ('bondage', 1),
 ('power', 1)]

In [23]:
# Here we can see that the token "from" is followed by the token "the" 37 times, and so on.

In [24]:
# Turn each row of follower counts into relative frequencies.
# NOTE: this overwrites the values stored in `counts` in place, so after this
# cell `counts[token][next_token]` is a fraction of the row total, not a count.
for token, followers in counts.items():
    total_count = sum(followers.values())
    for next_token in followers:
        followers[next_token] /= total_count

In [44]:
# Show the 10 tokens with the highest value after "from", largest first
probs_From = sorted(
    counts["from"].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
probs_From[:10]

[('the', 0.19170984455958548),
 ('his', 0.06217616580310881),
 ('her', 0.05181347150259067),
 ('my', 0.046632124352331605),
 ('this', 0.046632124352331605),
 ('our', 0.04145077720207254),
 ('me', 0.04145077720207254),
 ('a', 0.031088082901554404),
 ('your', 0.025906735751295335),
 ('him', 0.025906735751295335)]

In [None]:
# Generate text by repeatedly sampling the next token from a bigram table.
def generate_text(start_token, length=20, transitions=None, rng=None):
    """Generate text by walking a bigram (first-order Markov) table.

    Parameters
    ----------
    start_token : str
        First token of the output; it is lower-cased before lookup.
    length : int, default 20
        Maximum number of tokens in the output, including ``start_token``.
        Values below 1 produce an empty string.
    transitions : dict[str, dict[str, float]], optional
        Maps a token to ``{next_token: weight}``. Weights may be raw counts
        or probabilities; each row is renormalised before sampling. When
        omitted, the notebook-level ``counts`` table is used.
    rng : object with a NumPy-style ``choice(a, p=...)`` method, optional
        Source of randomness, e.g. ``np.random.default_rng(42)``. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    str
        The generated tokens joined by single spaces. Generation stops early
        when the current token has no usable followers in the table.
    """
    if length < 1:
        return ""

    table = counts if transitions is None else transitions
    sampler = np.random if rng is None else rng

    current_token = start_token.lower()
    output = [current_token]
    for _ in range(length - 1):
        followers = table.get(current_token)
        if not followers:
            # Unknown token or a token without followers: nothing to sample.
            break
        next_tokens = list(followers)
        weights = np.fromiter(followers.values(), dtype=float, count=len(followers))
        total = weights.sum()
        if total <= 0:
            break
        # Do not reseed inside this loop: every step would then repeat the
        # same random draw, which locks the output into a short cycle.
        # Seed once before calling, or pass a seeded `rng`.
        current_token = str(sampler.choice(next_tokens, p=weights / total))
        output.append(current_token)
    return " ".join(output)

In [57]:
# Generate a sample of up to 150 tokens, starting from the token "from"
generate_text("from", length=150)

'from a souldier , to his brutus . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you cask . and you'

# Drawback of the N-gram model for text generation

In [63]:
# Concatenate the three plays (Hamlet, Macbeth, Julius Caesar) into one token
# sequence, lower-case every token, and join them into one space-separated string.
hamlet_words = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
macbeth_words = nltk.corpus.gutenberg.words('shakespeare-macbeth.txt')
caesar_words = nltk.corpus.gutenberg.words('shakespeare-caesar.txt')

whole_corpus = hamlet_words + macbeth_words + caesar_words
lowered_corpus = list(map(str.lower, whole_corpus))
whole_string = " ".join(lowered_corpus)

In [70]:
# Occurrences of the substring "from " in the space-joined, lower-cased text
whole_string.count("from ")

193

In [71]:
# Occurrences of the two-word substring "from the " in the space-joined, lower-cased text
whole_string.count("from the ")

37

In [72]:
# Occurrences of the three-word substring "from the streets " in the space-joined, lower-cased text
whole_string.count("from the streets ")

1