# Assignment 1: Building a Shakespearean Text Generator

In [1]:
import nltk
nltk.download('gutenberg')
nltk.download('brown')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/ricardomendezcavalieri/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/ricardomendezcavalieri/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

##  Data preparation

In [2]:
# Load three Shakespeare plays from the Gutenberg corpus and count, for every
# bigram, which word follows it.

from nltk.corpus import gutenberg
from collections import defaultdict

caesar = gutenberg.words('shakespeare-caesar.txt')
hamlet = gutenberg.words('shakespeare-hamlet.txt')
macbeth = gutenberg.words('shakespeare-macbeth.txt')

# Keep only alphanumeric tokens (this drops punctuation), lowercase them and
# wrap each play in an nltk.Text

caesar = nltk.Text([token.lower() for token in caesar if token.isalnum()])
hamlet = nltk.Text([token.lower() for token in hamlet if token.isalnum()])
macbeth = nltk.Text([token.lower() for token in macbeth if token.isalnum()])

print(caesar.tokens[:10])

# Bigram list, one per play

caesar_bigrams = list(nltk.bigrams(caesar))
hamlet_bigrams = list(nltk.bigrams(hamlet))
macbeth_bigrams = list(nltk.bigrams(macbeth))

# bigram -> {next word -> number of times it follows that bigram}
from_bigram_to_next_token_counts = defaultdict(lambda: defaultdict(int))

# The word that follows a bigram is the second element of the bigram that
# comes right after it. Each play is counted on its own, so no transition
# spans two plays.

for play_bigrams in (caesar_bigrams, hamlet_bigrams, macbeth_bigrams):
    for context, following in zip(play_bigrams, play_bigrams[1:]):
        from_bigram_to_next_token_counts[context][following[1]] += 1

['the', 'tragedie', 'of', 'julius', 'caesar', 'by', 'william', 'shakespeare', '1599', 'actus']


## Probability Distribution

In [3]:
# Calculate the probability of the next word given a bigram

# For a given bigram, P(next word) = (times that word followed the bigram)
# divided by (total number of words observed after the bigram)

from_bigram_to_next_token_prob = defaultdict(lambda: defaultdict(float))

for context, follower_counts in from_bigram_to_next_token_counts.items():
    denominator = sum(follower_counts.values())
    for token, occurrences in follower_counts.items():
        from_bigram_to_next_token_prob[context][token] = occurrences / denominator

print(from_bigram_to_next_token_prob[('the', 'tragedie')])

defaultdict(<class 'float'>, {'of': 1.0})


## Sampling Next Token

In [4]:
import numpy as np

# Using np.random.choice, this function samples the next token given the list
# of possible next words (keys) and their probabilities (values)

def sample_next_token(ngram, prob_dict):
    """Randomly pick the token that follows ``ngram``, weighted by probability.

    Args:
        ngram: The context to continue. A list is accepted and converted to a
            tuple before the lookup.
        prob_dict: Mapping from a context to a mapping of
            ``next token -> probability``.

    Returns:
        The sampled token as a plain ``str``, or ``None`` when the context is
        unknown or has no recorded continuations.
    """
    if isinstance(ngram, list):
        # Lists are unhashable, so looking one up would raise TypeError.
        ngram = tuple(ngram)

    # .get() never inserts a key, even when prob_dict is a defaultdict.
    distribution = prob_dict.get(ngram)
    if not distribution:
        return None  # Fallback case: nothing to sample from

    candidates = list(distribution.keys())
    weights = list(distribution.values())
    # np.random.choice returns a numpy string; convert it to a built-in str.
    return str(np.random.choice(candidates, p=weights))

sample_next_token(('the', 'people'), from_bigram_to_next_token_prob)

'fell'

## Generating Text

In [5]:
import random

def build_sentence(start_ngram: tuple, length: int, ngram_prob: dict) -> list:
    """Grow a sentence word by word from a starting n-gram.

    At every step the last ``len(start_ngram)`` words are used as the context,
    a next word is sampled for it with ``sample_next_token`` and appended.

    Args:
        start_ngram: The words the sentence starts with; its size also fixes
            the size of the context used for every lookup.
        length: Target number of words in the sentence.
        ngram_prob: Mapping from a context tuple to a mapping of
            ``next word -> probability``.

    Returns:
        The list of words. It is shorter than ``length`` when a context that
        is not in ``ngram_prob`` is reached.
    """
    order = len(start_ngram)
    words = list(start_ngram)

    for _ in range(length - order):
        context = tuple(words[-order:])

        # Dead end: this context was never seen, so stop early.
        if context not in ngram_prob:
            break

        words.append(sample_next_token(context, ngram_prob))

    return words



build_sentence(['be', 'or'], 50, from_bigram_to_next_token_prob)

['be',
 'or',
 'we',
 'will',
 'fetters',
 'put',
 'vpon',
 'this',
 'bloodie',
 'question',
 'you',
 'seeme',
 'to',
 'feare',
 'things',
 'that',
 'are',
 'married',
 'already',
 'all',
 'but',
 'metellus',
 'cymber',
 'brut',
 'they',
 'are',
 'vanished',
 'calp',
 'caesar',
 'i',
 'neuer',
 'gaue',
 'you',
 'ought',
 'of',
 'woe',
 'king',
 'tis',
 'deepely',
 'sworne',
 'sweet',
 'leaue',
 'me',
 'friends',
 'tis',
 'now',
 'strook',
 'twelue',
 'get',
 'thee']

## Different N-Gram Exploration

In [18]:
from nltk import ngrams
from nltk.corpus import brown

def get_n_grams(length, text=None):
    """Count, for every n-gram of ``length`` tokens, which token follows it.

    Words are lowercased and every token that is not purely alphanumeric
    (punctuation, for instance) is dropped before counting.

    Args:
        length: Number of tokens in each n-gram used as context. Must be >= 1.
        text: Optional iterable of word strings to build the counts from.
            When omitted, the counts are built from three Shakespeare plays
            of the Gutenberg corpus plus the 'news' and 'fiction' categories
            of the Brown corpus (a modern corpus added to the mix).

    Returns:
        A ``defaultdict`` mapping each n-gram tuple to a ``defaultdict(int)``
        of ``next token -> number of times it follows that n-gram``.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be a positive integer")

    from_ngram_to_next_token_counts = defaultdict(lambda: defaultdict(int))

    # Helper function that lowercases all words and removes punctuation
    def clean_corpus(corp):
        return [word.lower() for word in corp if word.isalnum()]

    # Helper function that adds the transitions of one token list to the counts
    def count_transitions(tokens):
        # The n-gram starting at position i is followed by tokens[i + length].
        # Iterating up to len(tokens) - length covers every n-gram that has a
        # follower, including the ones at the very end of the corpus.
        for i in range(len(tokens) - length):
            ngram = tuple(tokens[i:i + length])
            from_ngram_to_next_token_counts[ngram][tokens[i + length]] += 1

    if text is None:
        corpora = [
            gutenberg.words('shakespeare-caesar.txt'),
            gutenberg.words('shakespeare-hamlet.txt'),
            gutenberg.words('shakespeare-macbeth.txt'),
            brown.words(categories='news') + brown.words(categories='fiction'),
        ]
    else:
        corpora = [text]

    # Each corpus is counted separately so that no n-gram spans the boundary
    # between two entries of the list above.
    for corpus in corpora:
        count_transitions(clean_corpus(corpus))

    return from_ngram_to_next_token_counts

def get_n_grams_prob(ngram_counts):
    """Turn next-token counts into next-token probabilities.

    Args:
        ngram_counts: Mapping from an n-gram to a mapping of
            ``next token -> count``.

    Returns:
        A ``defaultdict`` with the same keys, where each count is replaced by
        the count divided by the sum of all counts recorded for that n-gram.
    """
    probabilities = defaultdict(lambda: defaultdict(float))

    for context in ngram_counts:
        followers = ngram_counts[context]
        denominator = sum(followers.values())
        for token in followers:
            probabilities[context][token] = followers[token] / denominator

    return probabilities


In [19]:
# Build next-token counts for 2-token contexts from the default corpora
# (no custom text is passed), then convert the counts into probabilities.
ngram_counts = get_n_grams(2)
ngram_probs = get_n_grams_prob(ngram_counts)


In [20]:
sentence = build_sentence(['you', 'are'], 50, ngram_probs)

In [21]:
print(sentence)

['you', 'are', 'not', 'going', 'to', 'try', 'to', 'exploit', 'the', 'land', 'of', 'heaven', 'haint', 'there', 'just', 'nothin', 'like', 'sweet', 'bels', 'iangled', 'out', 'of', 'character', 'but', 'he', 'parting', 'from', 'her', 'faire', 'iudgement', 'without', 'the', 'which', 'he', 'found', 'the', 'house', 'of', 'correction', 'the', 'jail', 'sentence', 'is', 'to', 'be', 'lost', 'he', 'stood', 'back', 'to']


# Tests

In [22]:
import unittest
from collections import defaultdict
from nltk.util import ngrams

mock_corpus = ["this", "is", "a", "test", "this", "is", "only", "a", "test"]

def manual_ngram_counts(corpus, n):
    """Build an n-gram -> next-word-count dictionary straight from a token list.

    It is used to feed the test of the function that builds the probabilities
    of the next word given an n-gram.

    Args:
        corpus: Sequence of tokens, used as given.
        n: Number of tokens per n-gram.

    Returns:
        A ``defaultdict`` mapping each n-gram to a ``defaultdict(int)`` of
        ``next word -> count``.
    """
    windows = list(ngrams(corpus, n))
    tallies = defaultdict(lambda: defaultdict(int))
    # The word following an n-gram is the last element of the n-gram after it.
    for current, following in zip(windows, windows[1:]):
        tallies[current][following[-1]] += 1
    return tallies

class TestNGramFunctions(unittest.TestCase):

    # Checks get_n_grams on the mock corpus.
    # Type checks: the returned object is a defaultdict, every inner dictionary
    # is a defaultdict, and the first count of every inner dictionary is an int.
    #
    # Value check: 'a' follows the bigram ('this', 'is') exactly once.
    def test_get_n_grams(self):
        counts = get_n_grams(2, text=mock_corpus)

        self.assertIsInstance(counts, defaultdict)
        for followers in counts.values():
            self.assertIsInstance(followers, defaultdict)
            first_count = next(iter(followers.values()), 0)
            self.assertIsInstance(first_count, int)
        self.assertEqual(counts[('this', 'is')]['a'], 1)

    # Checks get_n_grams_prob on counts built by manual_ngram_counts.
    # Type checks: the returned object is a defaultdict, every inner dictionary
    # is a defaultdict, and the first probability of every inner dictionary is
    # a float.
    #
    # Value check: the probabilities of every inner dictionary sum to 1.
    def test_get_n_grams_prob(self):
        counts = manual_ngram_counts(mock_corpus, 2)
        probabilities = get_n_grams_prob(counts)

        self.assertIsInstance(probabilities, defaultdict)
        for distribution in probabilities.values():
            self.assertIsInstance(distribution, defaultdict)
            first_probability = next(iter(distribution.values()), 0.0)
            self.assertIsInstance(first_probability, float)

        for distribution in probabilities.values():
            self.assertAlmostEqual(sum(distribution.values()), 1.0, places=5)

# Load the tests of TestNGramFunctions explicitly and run them with a text runner.
unittest.TextTestRunner().run(unittest.defaultTestLoader.loadTestsFromTestCase(TestNGramFunctions))


..
----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK


defaultdict(<class 'int'>, {'a': 1, 'only': 1})
defaultdict(<class 'int'>, {'test': 1})
defaultdict(<class 'int'>, {'this': 1})
defaultdict(<class 'int'>, {'is': 1})


<unittest.runner.TextTestResult run=2 errors=0 failures=0>