# Data Preparation 📓✏️

> Before starting, define the n-gram size **n** (the number of tokens per n-gram) to use.


In [64]:
# Number of tokens per n-gram; passed as `n` to get_ngrams in a later cell (2 = bigrams).
n_grams = 2

In [65]:
# Load three Shakespeare plays from NLTK's Gutenberg corpus as word tokens
# and report how many tokens each one contains.

import nltk
# NOTE(review): the return value of this call is discarded, so it displays nothing;
# presumably a leftover from checking which file ids are available.
nltk.corpus.gutenberg.fileids()

caesar = nltk.corpus.gutenberg.words('shakespeare-caesar.txt')
print("Caesar length: ", len(caesar))
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
print("Hamlet length: ", len(hamlet))
macbeth = nltk.corpus.gutenberg.words('shakespeare-macbeth.txt')
print("Macbeth length: ", len(macbeth))

Caesar length:  25833
Hamlet length:  37360
Macbeth length:  23140


> Lowering the case of the text and removing punctuation


In [66]:
import string

# lowercase and remove punctuation
def preprocess_text(text):
    """
    Lowercase every token and drop tokens made up solely of punctuation.

    Args:
        text (iterable of str): Word tokens to clean

    Returns:
        list of str: The lowercased tokens, without punctuation-only tokens
        (empty strings are dropped as well)
    """
    punctuation_chars = set(string.punctuation)

    # Convert to lowercase
    lowered = (word.lower() for word in text)

    # Remove punctuation: a token is dropped when *every* character is a
    # punctuation character. A plain `word in string.punctuation` is a substring
    # test against one long string, which lets multi-character tokens such as
    # "--" or "?)" slip through.
    return [
        word for word in lowered
        if not all(ch in punctuation_chars for ch in word)
    ]

# Preprocess texts: each *_clean list is derived from the corresponding raw token sequence
caesar_clean = preprocess_text(caesar)
hamlet_clean = preprocess_text(hamlet)
macbeth_clean = preprocess_text(macbeth)

# Preview the first 10 cleaned tokens of each play
print(caesar_clean[:10])
print(hamlet_clean[:10])
print(macbeth_clean[:10])

['the', 'tragedie', 'of', 'julius', 'caesar', 'by', 'william', 'shakespeare', '1599', 'actus']
['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare', '1599', 'actus', 'primus']
['the', 'tragedie', 'of', 'macbeth', 'by', 'william', 'shakespeare', '1603', 'actus', 'primus']


> Creating the **list of n-grams** for each text


In [67]:
# list of n-grams for each text
def get_ngrams(tokens, n):
    """
    Build the list of n-grams over a sequence of tokens.

    Args:
        tokens: Tokens to build the n-grams from
        n (int): Number of tokens per n-gram

    Returns:
        list: Everything produced by nltk.ngrams(tokens, n), materialized as a list
    """
    ngram_iter = nltk.ngrams(tokens, n)
    return [gram for gram in ngram_iter]

# Get n-grams for each text, using the n_grams size configured at the top of the notebook
caesar_ngrams = get_ngrams(caesar_clean, n_grams)
hamlet_ngrams = get_ngrams(hamlet_clean, n_grams)
macbeth_ngrams = get_ngrams(macbeth_clean, n_grams)

# Preview the first five n-grams of each play
print(f"\n{n_grams}-grams from Caesar:")
print(caesar_ngrams[:5])
print(f"\n{n_grams}-grams from Hamlet:")
print(hamlet_ngrams[:5])
print(f"\n{n_grams}-grams from Macbeth:")
print(macbeth_ngrams[:5])


2-grams from Caesar:
[('the', 'tragedie'), ('tragedie', 'of'), ('of', 'julius'), ('julius', 'caesar'), ('caesar', 'by')]

2-grams from Hamlet:
[('the', 'tragedie'), ('tragedie', 'of'), ('of', 'hamlet'), ('hamlet', 'by'), ('by', 'william')]

2-grams from Macbeth:
[('the', 'tragedie'), ('tragedie', 'of'), ('of', 'macbeth'), ('macbeth', 'by'), ('by', 'william')]


> Counting, for each n-gram, the **frequency** of every token that follows it, along with the n-gram's own total frequency


In [68]:
# Count the frequency of each n-gram's subsequent token
# Structure: n-gram tuple -> {'count': occurrences that have a following token,
#                             'next_tokens': {following token: occurrences}}
# Example: from_ngram_to_next_token_counts[('to', 'be')] = {'count': 15, 'next_tokens': {'or': 10, 'not': 5}}
# This means that the n-gram 'to be' is followed by 'or' 10 times and by 'not' 5 times, and the total count of ngram 'to be' is 15


# Shared across all plays; filled in place by count_next_token below.
from_ngram_to_next_token_counts = {}

def count_next_token(ngrams, tokens, dictionary):
    """
    Accumulate, for every n-gram, how often each token directly follows it.

    The n-gram at position i in `ngrams` is assumed to start at tokens[i], so
    the token that follows it sits at tokens[i + len(ngram)]. Deriving the
    offset from the n-gram itself keeps the counts correct for any n-gram size
    instead of only for bigrams.

    Args:
        ngrams (list of tuple): N-grams built from `tokens`, in order
        tokens (list of str): The token sequence the n-grams were built from
        dictionary (dict): Counts table, updated in place. Maps each n-gram to
            {'count': int, 'next_tokens': {token: int}}

    Returns:
        None: Results are written into `dictionary`
    """
    total_tokens = len(tokens)

    for i, ngram in enumerate(ngrams):
        next_index = i + len(ngram)

        # Make sure we have a next token (the final n-gram has none)
        if next_index >= total_tokens:
            continue

        next_token = tokens[next_index]

        # If ngram doesn't exist, create new entry with count and next_tokens dictionary
        entry = dictionary.setdefault(ngram, {'count': 0, 'next_tokens': {}})

        # Increment total count for this ngram
        entry['count'] += 1

        # Add or increment next token count
        followers = entry['next_tokens']
        followers[next_token] = followers.get(next_token, 0) + 1

# Count next tokens from each play; all three calls accumulate into the same dictionary
count_next_token(caesar_ngrams, caesar_clean, from_ngram_to_next_token_counts)
count_next_token(hamlet_ngrams, hamlet_clean, from_ngram_to_next_token_counts)
count_next_token(macbeth_ngrams, macbeth_clean, from_ngram_to_next_token_counts)


# Inspect one entry to check the structure of the counts table
ngram = ('the', 'tragedie')
result = from_ngram_to_next_token_counts[ngram]
print(f"N-gram '{ngram[0]} {ngram[1]}':")
print(f"Total count: {result['count']}")
print(f"Next tokens: {result['next_tokens']}")

# Same entry again, printed as the raw nested dictionary
print("\nRaw data:")
print(from_ngram_to_next_token_counts[('the', 'tragedie')])

N-gram 'the tragedie':
Total count: 6
Next tokens: {'of': 6}

Raw data:
{'count': 6, 'next_tokens': {'of': 6}}


# Probability Distribution 📊

> Calculating the **probability** of each n-gram's subsequent token

In [69]:
# Calculating the probability of each ngram's subsequent token

# NOTE(review): this empty placeholder is replaced later in this cell by the return
# value of calculate_probabilities(...); the initial {} is never read in between.
from_ngram_to_next_token_probs = {}

def calculate_probabilities(counts_dict, ndigits=3):
    """
    Turn next-token counts into next-token probabilities for every n-gram.

    Args:
        counts_dict (dict): Maps each n-gram to
            {'count': int, 'next_tokens': {token: int}}
        ndigits (int or None): Decimal places each probability is rounded to.
            Defaults to 3. Pass None to keep the exact values; rounding turns
            any probability below 0.0005 into 0.0, which removes that token
            from the distribution used for sampling.

    Returns:
        dict: Maps each n-gram to {token: probability}, where probability is
        the token's count divided by the n-gram's 'count'
    """
    probs_dict = {}

    # For each ngram in the counts dictionary
    for ngram, data in counts_dict.items():
        total_count = data['count']

        # Calculate probability for each next token
        probs = {}
        for token, count in data['next_tokens'].items():
            prob = count / total_count
            probs[token] = prob if ndigits is None else round(prob, ndigits)

        probs_dict[ngram] = probs

    return probs_dict

# Calculate probabilities from the accumulated counts table
from_ngram_to_next_token_probs = calculate_probabilities(from_ngram_to_next_token_counts)

# Example probabilities for the same ngram we checked before
ngram = ('the', 'tragedie')
print(f"Probabilities for ngram '{ngram[0]} {ngram[1]}':")
print(from_ngram_to_next_token_probs[ngram])

# Show the first five entries of the table (dictionary insertion order)
print("\nMore examples:")
for ngram, probs in list(from_ngram_to_next_token_probs.items())[:5]:
    print(f"\nngram '{ngram[0]} {ngram[1]}':")
    print(f"Probabilities: {probs}")


Probabilities for ngram 'the tragedie':
{'of': 1.0}

More examples:

ngram 'the tragedie':
Probabilities: {'of': 1.0}

ngram 'tragedie of':
Probabilities: {'julius': 0.167, 'ivlivs': 0.167, 'hamlet': 0.333, 'macbeth': 0.333}

ngram 'of julius':
Probabilities: {'caesar': 1.0}

ngram 'julius caesar':
Probabilities: {'by': 1.0}

ngram 'caesar by':
Probabilities: {'william': 1.0}


# Sampling Next Token 🎲


> Sampling the next token **based on the probability distribution** for a given ngram.


In [70]:
import numpy as np

def sample_next_token(ngram, prob_dict):
    """
    Sample the next token based on the probability distribution for a given ngram.

    Args:
        ngram (tuple): A tuple of tokens (previous tokens)
        prob_dict (dict): Dictionary containing probability distributions for ngrams

    Returns:
        str: The sampled next token, as a plain Python str
        None: If the ngram is not found, or its distribution is empty or has
        no positive total weight (nothing can be sampled)
    """
    # Check if ngram exists in our probability dictionary
    if ngram not in prob_dict:
        return None

    # Get probability distribution for this ngram
    next_token_probs = prob_dict[ngram]

    # Get tokens and their probabilities as separate lists
    tokens = list(next_token_probs.keys())
    weights = list(next_token_probs.values())

    # np.random.choice raises ValueError for an empty population and for
    # probabilities that do not sum to 1, so report "cannot continue" instead.
    total_weight = sum(weights)
    if not tokens or total_weight <= 0:
        return None

    # Normalize probabilities to ensure they sum to 1
    probs = [weight / total_weight for weight in weights]

    # Sample an index rather than the token itself: choosing from the token
    # list would return a numpy.str_ instead of the original str object.
    chosen_index = np.random.choice(len(tokens), p=probs)

    return tokens[chosen_index]

# Test the sampling function on an n-gram with several possible continuations
test_ngram = ('tragedie', 'of')
print(f"Testing sampling for ngram '{test_ngram[0]} {test_ngram[1]}'")
print(f"Probability distribution: {from_ngram_to_next_token_probs[test_ngram]}")

# Sample multiple times to see the distribution
# NOTE(review): no random seed is set in the visible cells, so the empirical
# numbers below will differ from run to run.
n_samples = 1000
samples = [sample_next_token(test_ngram, from_ngram_to_next_token_probs) for _ in range(n_samples)]

# Calculate and print the empirical distribution (share of samples per distinct token)
unique_tokens = set(samples)
empirical_dist = {token: samples.count(token)/n_samples for token in unique_tokens}

print(f"\nEmpirical distribution after {n_samples} samples:")
for token, prob in empirical_dist.items():
    print(f"'{token}': {prob:.3f}")

Testing sampling for ngram 'tragedie of'
Probability distribution: {'julius': 0.167, 'ivlivs': 0.167, 'hamlet': 0.333, 'macbeth': 0.333}

Empirical distribution after 1000 samples:
'hamlet': 0.340
'julius': 0.186
'ivlivs': 0.162
'macbeth': 0.312


# Generating Text 📝


> Generating text starting from an initial n-gram, up to a specified number of words

In [49]:
def generate_text_from_ngram(initial_ngram, num_words, prob_dict):
    """
    Generate text starting from an initial n-gram.

    The context window used to pick each next word has the same length as
    `initial_ngram`, so the function does not depend on any notebook-level
    setting; it only needs `prob_dict` to be keyed by n-grams of that length.

    Args:
        initial_ngram (tuple): The starting n-gram
        num_words (int): Total number of words wanted, including the words of
            the starting n-gram. If this is not larger than the length of
            `initial_ngram`, only the starting n-gram is returned.
        prob_dict (dict): Dictionary containing probability distributions

    Returns:
        str: Generated text, words joined by single spaces. May be shorter
        than `num_words` if a context with no known continuation is reached.
    """
    # Initialize text with the initial n-gram
    generated_words = list(initial_ngram)
    context_size = len(initial_ngram)

    # Generate remaining words
    current_ngram = initial_ngram
    for _ in range(num_words - context_size):
        # Sample next token
        next_token = sample_next_token(current_ngram, prob_dict)

        # If we can't continue (no following tokens found), break
        if next_token is None:
            break

        # Add the new token to our generated text
        generated_words.append(next_token)

        # Create new n-gram for next iteration: the last `context_size` words
        current_ngram = tuple(generated_words[-context_size:])

    # Join all words with spaces
    return ' '.join(generated_words)

# Test the text generation with different initial n-grams
# (each start is a 2-token tuple, matching the 2-token keys shown for the probability table)
test_cases = [
    ('the', 'tragedie'),
    ('to', 'be'),
    ('william', 'shakespeare')
]

# Generate up to 20 words (starting n-gram included) for each start
print("Generated Text Examples:\n")
for initial_ngram in test_cases:
    print(f"Starting with '{initial_ngram[0]} {initial_ngram[1]}':")
    generated_text = generate_text_from_ngram(initial_ngram, 20, from_ngram_to_next_token_probs)
    print(f"{generated_text}\n")

Generated Text Examples:

Starting with 'the tragedie':
the tragedie of macbeth marry he was likely had he his hurts before rosse i and truly you were not

Starting with 'to be':
to be sounded more then all the gods to day decius neuer feare that if againe this apparition come he

Starting with 'william shakespeare':
william shakespeare 1603 actus primus scoena prima thunder and lightning enter caska and those that should be prickt to dye



# Results 🏁

## **2-grams**

Generated text examples with 20 words:

Starting with 'the tragedie':
>- the tragedie of macbeth marry he was likely had he his hurts before rosse i and truly you were not

Starting with 'to be':
>- to be sounded more then all the gods to day decius neuer feare that if againe this apparition come he

Starting with 'william shakespeare':
>- william shakespeare 1603 actus primus scoena prima thunder and lightning enter caska and those that should be prickt to dye

