In [1]:
import numpy as np
import string

# Set the random seed to ensure reproducibility of random numbers

In [2]:
# Seed NumPy's global random generator so the sampled words are the same on every run
np.random.seed(seed=1234)

In [3]:
# Raw model tables, filled in while the corpus is read:
#   initial      - first word of a line -> how many lines start with it
#   first_order  - first word of a line -> list of second words seen after it
#   second_order - (word, following word) pair -> list of words seen next,
#                  with the marker 'END' recorded when the pair ends a line
initial, first_order, second_order = {}, {}, {}

# Define a function to remove punctuation 

In [4]:
def remove_punctuation(s):
    """Return `s` with every character found in `string.punctuation` deleted."""
    # A translation table mapping a character to None deletes that character
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(strip_table)

# Define a helper that appends a value to the list stored under a dictionary key

In [5]:
def add2dict(d, k, v):
    """Append `v` to the list stored at `d[k]`, starting a new list for an unseen key."""
    d.setdefault(k, []).append(v)

# Iterate over lines in file

In [6]:
# Build the raw model tables from the corpus, one text line at a time.
# The `with` block closes the file as soon as the loop finishes instead of
# leaving the handle open until the garbage collector gets to it.
with open('robert_frost_poe.txt') as corpus:
    for line in corpus:
        # Remove punctuation, convert to lowercase, and split the line into tokens
        tokens = remove_punctuation(line.rstrip().lower()).split()

        # Number of tokens in this line (0 for a blank line, which is skipped below)
        T = len(tokens)

        for i in range(T):
            t = tokens[i]
            if i == 0:
                # First token of the line: count it as a line starter
                initial[t] = initial.get(t, 0.) + 1
            else:
                t_1 = tokens[i-1]
                if i == T - 1:
                    # Last token of the line: the pair (previous, last) is followed by 'END'
                    add2dict(second_order, (t_1, t), 'END')
                if i == 1:
                    # Second token: record it as a successor of the first token
                    add2dict(first_order, t_1, t)
                else:
                    t_2 = tokens[i-2]
                    # Third token onwards: record it as a successor of the two preceding tokens
                    add2dict(second_order, (t_2, t_1), t)
        # Note: a one-token line only updates `initial`; it adds no
        # first_order entry and no 'END' marker.

# Normalize start of phrase counts

In [7]:
# Total number of line-starting words counted across the corpus
initial_total = sum(initial.values())

# Rescale each count in place so the values of `initial` sum to 1
for t in initial:
    initial[t] /= initial_total

# Convert list to probability dictionary

In [8]:
def list2pdict(ts):
    """Convert a list of items into a dict mapping each item to its relative frequency.

    Keys appear in order of first occurrence in `ts`. An empty list gives an
    empty dict.
    """
    total = len(ts)

    # Tally occurrences as floats, one key per distinct item
    counts = {}
    for item in ts:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1.

    # Divide every tally by the list length to get a probability
    return {item: count / total for item, count in counts.items()}

# Convert first-order and second-order transition lists to probabilities

In [9]:
# Replace each list of observed next words with a {word: probability} dict.
# The isinstance guard skips entries that have already been converted, so
# re-running this cell is harmless. Without it a probability dict would be
# passed back into list2pdict, which iterates over its keys and would silently
# turn the learned distribution into a uniform one.
for t_1, ts in first_order.items():
    if isinstance(ts, list):
        first_order[t_1] = list2pdict(ts)

# Same conversion for the (word, following word) -> next-word table
for k, ts in second_order.items():
    if isinstance(ts, list):
        second_order[k] = list2pdict(ts)

# Sample word from probability dictionary  

In [10]:
def sample_word(d):
    """Draw one key at random from `d`, a dict mapping word -> probability.

    A uniform number in [0, 1) is drawn with np.random.random(); the items are
    then walked in dictionary order, accumulating probability, and the first
    word whose cumulative probability exceeds the draw is returned.

    Raises:
        ValueError: if `d` is empty, or if its probabilities sum to noticeably
            less than 1 and the draw falls in the uncovered remainder.
    """
    if not d:
        raise ValueError('cannot sample from an empty probability dictionary')

    # Uniform draw in [0, 1)
    p0 = np.random.random()

    cumulative = 0
    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t

    # Floating-point rounding can leave the running total a hair below 1
    # (ten 0.1s add up to 0.9999999999999999), so a draw at the very top of the
    # range can fall through the loop. For a normalised dict the draw then
    # belongs to the last word.
    if abs(cumulative - 1) < 1e-8:
        return t

    # An explicit exception (rather than `assert`) still fires under `python -O`
    raise ValueError(
        'probabilities sum to %r, expected 1' % (cumulative,)
    )

# Generate sentences  

In [11]:
def generate(n_lines=4):
    """Print `n_lines` randomly generated lines, one per printed line.

    Each line starts with a word sampled from `initial`, takes its second word
    from `first_order`, and then keeps sampling from `second_order`, keyed by
    the last two words, until the marker 'END' is drawn.

    Args:
        n_lines: how many lines to generate (default 4).
    """
    for _ in range(n_lines):
        # Start with a random line-opening word
        w0 = sample_word(initial)
        sentence = [w0]

        # A word that only ever appeared on one-word lines has no first_order
        # entry; emit it on its own rather than failing with KeyError.
        successors = first_order.get(w0)
        if successors is None:
            print(w0)
            continue

        # Second word, conditioned on the first
        w1 = sample_word(successors)
        sentence.append(w1)

        # Keep extending, conditioned on the last two words, until 'END' is drawn.
        # Corpus tokens are lowercased, so 'END' cannot be a real word.
        while True:
            w2 = sample_word(second_order[(w0, w1)])
            if w2 == 'END':
                break

            sentence.append(w2)
            # Slide the two-word context window forward
            w0, w1 = w1, w2

        print(' '.join(sentence))


In [12]:
# Print four generated lines; the exact words depend on the random seed set earlier
generate()

i went to bed alone and left me
might just as empty
but it isnt as if and thats not all the money goes so fast
you couldnt call it living for it aint
