# Implementations

In [1]:
import numpy as np
import nltk
# Download the NLTK 'gutenberg' corpus package that the rest of the notebook reads from.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/ozgurozdemir/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

## Loading the Hamlet Corpus

In [2]:
# Load Hamlet from the NLTK Gutenberg corpus as a sequence of tokens
# (words and punctuation marks are separate items).
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')

In [3]:
# Peek at the first ten tokens to see what the raw token stream looks like.
print(hamlet[:10])

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']


In [4]:
# Corpus size before normalisation: total tokens vs. distinct token strings.
print(
    f'# of tokens: {len(hamlet)}',
    f'# of unique words: {len(set(hamlet))}',
    sep='\n',
)

# of tokens: 37360
# of unique words: 5447


In [5]:
# Lower-case every token so that case variants (e.g. 'The' / 'the') count as one word.
hamlet = list(map(str.lower, hamlet))

In [6]:
# Same statistics after lower-casing: the token count is unchanged,
# while the number of distinct tokens shrinks.
print(
    f'# of tokens: {len(hamlet)}',
    f'# of unique words: {len(set(hamlet))}',
    sep='\n',
)

# of tokens: 37360
# of unique words: 4716


## Creating Bigrams

In [7]:
# Show the token stream above a copy shifted left by one token:
# each column then holds a (token, next token) pair, i.e. a bigram.
print(*hamlet[:10], sep='\t')
print(*hamlet[1:10], sep='\t')

[	the	tragedie	of	hamlet	by	william	shakespeare	1599	]
the	tragedie	of	hamlet	by	william	shakespeare	1599	]


In [8]:
# Pair every token with its successor; the last token has no successor,
# so there is one bigram fewer than there are tokens.
bigrams = list(zip(hamlet[:-1], hamlet[1:]))
print(f'# of bigrams: {len(bigrams)}')

# of bigrams: 37359


In [9]:
# Inspect the first five (token, next token) pairs.
print(bigrams[:5])

[('[', 'the'), ('the', 'tragedie'), ('tragedie', 'of'), ('of', 'hamlet'), ('hamlet', 'by')]


In [10]:
def prepare_bigrams(corpus):
    """Return the list of adjacent item pairs of a sequence.

    Args:
        corpus: A sliceable sequence of tokens (e.g. a list of strings).

    Returns:
        A list of ``(item, next_item)`` tuples in order of appearance.
        Sequences with fewer than two items yield an empty list.
    """
    leading = corpus
    trailing = corpus[1:]  # the same sequence shifted left by one position
    # zip stops at the shorter input, so the final item gets no partner.
    return [(left, right) for left, right in zip(leading, trailing)]

## Creating Bigram Language Model

In [11]:
# Count how often each bigram occurs in the corpus, then list the ten most frequent ones.
fd = nltk.FreqDist(bigrams)
print(fd.most_common(10))

[((',', 'and'), 466), (('ham', '.'), 337), (("'", 'd'), 223), (('my', 'lord'), 175), (('.', 'i'), 151), ((',', 'that'), 136), (("'", 's'), 122), ((',', 'i'), 105), (('king', '.'), 96), (('hor', '.'), 95)]


In [12]:
# fd.freq gives the relative frequency of a bigram:
# its count divided by the total number of bigrams counted in fd.
example_bigram = ('my', 'lord')
print(f'Frequency of {example_bigram}: {fd.freq(example_bigram)}')

Frequency of ('my', 'lord'): 0.00468427955780401


## Estimating Probabilities

In [13]:
def estimate_probability(sentence, fd):
    """Score a tokenized sentence by multiplying its bigram frequencies.

    Args:
        sentence: Sequence of tokens (e.g. the result of ``str.split``).
        fd: Object exposing ``freq(bigram)``; in this notebook, the
            ``nltk.FreqDist`` built over the corpus bigrams.

    Returns:
        The product of ``fd.freq(b)`` over every adjacent token pair ``b``
        in ``sentence``. A sentence with fewer than two tokens has no
        bigrams, so the result is the empty product, 1.0.

    Note:
        ``fd.freq`` is called on whole bigrams, so the score is a product
        of bigram relative frequencies rather than of conditional
        probabilities P(word | previous word).
    """
    bigram_freqs = []
    for pair in prepare_bigrams(sentence):
        bigram_freqs.append(fd.freq(pair))
    return np.prod(bigram_freqs)

In [14]:
# Score a single whitespace-tokenized sentence with the bigram frequency distribution.
estimate_probability('and you from england'.split(), fd)

4.986405805488382e-12

In [15]:
# Candidate sentences ranked from highest to lowest estimated probability.
# sorted() is stable, so sentences with equal scores keep their listed order.
test = sorted(
    [
        'and you from england',
        'order that these bodies',
        'polake warres , and you',
        'are heere arriued .',
        'these bodies high on a stage',
    ],
    key=lambda candidate: estimate_probability(candidate.split(), fd),
    reverse=True,
)

In [16]:
# Print the ranked sentences with a 1-based rank and their estimated probability.
for rank, sentence in enumerate(test, start=1):
    score = estimate_probability(sentence.split(), fd)
    print(f"{rank}. '{sentence}': {score}")

1. 'and you from england': 4.986405805488382e-12
2. 'order that these bodies': 1.9178483867263006e-14
3. 'are heere arriued .': 1.9178483867263006e-14
4. 'polake warres , and you': 3.1099134149168685e-15
5. 'these bodies high on a stage': 6.870586160465759e-23


## Generating Text

In [17]:
# Group the bigrams by their first token: cfd[w] is the frequency
# distribution of the tokens that follow w in the corpus.
cfd = nltk.ConditionalFreqDist(bigrams)

In [18]:
# Counts of every token observed immediately after 'horatio'.
cfd['horatio']

FreqDist({')': 1,
          ',': 17,
          '.': 5,
          ':': 2,
          '?': 3,
          'a': 1,
          'and': 3,
          'barn': 1,
          'ham': 1,
          'hor': 1,
          'hora': 1,
          'saies': 1,
          'tell': 1,
          'there': 1,
          'wait': 1})

In [19]:
# The single most frequent token following 'tragedie'.
cfd['tragedie'].max()

'of'

In [20]:
# Greedy text generation: starting from a seed token, repeatedly append the
# most frequent successor of the current token according to the bigram model.
token = 'horatio'
length = 15  # maximum number of tokens to append after the seed

generated_tokens = [token]
for _ in range(length):
    # .get() avoids inserting an empty entry into the defaultdict-based CFD
    # when the token was never seen as the first element of a bigram.
    followers = cfd.get(token)
    if not followers:
        # Dead end: no known successor, and .max() on an empty
        # distribution would raise ValueError.
        break
    token = followers.max()
    generated_tokens.append(token)

# Join once at the end instead of growing a string inside the loop.
generated_text = ' '.join(generated_tokens)

In [21]:
# Show the generated sequence; greedy selection always picks the same successor
# for a given token, so the text falls into a repeating cycle.
print(generated_text)

horatio , and the king . i haue you , and the king . i haue
