In [92]:
import os
import numpy as np
import re
import nltk
import random
import pronouncing
from nltk.corpus import cmudict

from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    parse_observations,
    sample_sentence,
    visualize_sparsities,
    animate_emission
)

A function from set 6 to parse the observations:

In [6]:
def parse_observations(text):
    # Convert text to dataset.
    lines = [line.split() for line in text.split('\n') if line.split()]

    obs_counter = 0
    obs = []
    obs_map = {}

    for line in lines:
        obs_elem = []
        
        for word in line:
            word = re.sub(r'[^\w]', '', word).lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        # Add the encoded sequence.
        obs.append(obs_elem)

    return obs, obs_map

First, load in the data and preprocess it. We want to create a dictionary for rhymes and a dictionary for which stress the word starts with.

In [7]:
text = open(os.path.join(os.getcwd(), \
                         'Release/data/shakespeare.txt')).read()
obs, obs_map = parse_observations(text)

Use the HMM class from the homework 6 solutions:

Here, we can first get the syllable counts of our text by reading in from our syllable dictionary and saving the counts for number of syllables.

Our syllable dictionary is in the form of {integer index: \[array of possible syllable values]}, where the array stores a second value if there is another number of syllables we can have

In [175]:
# here we can get the syllables

file = open('Release/data/Syllable_dictionary.txt', 'r')

syllable_dict = {}
keys = obs_map.keys()
# print(len(obs_map.keys()))
# i = 0
for line in file:
    l = line.strip().split()
    
    # all our words are saved to the observation dictionary with
    # special characters taken out, so we must use this regular
    # expression to strip out the special characters
    word = re.sub(r'[^\w]', '', l[0]).lower()
    if word in keys:
        l = sorted(l[1:])
        for i in range(len(l)):
            try:
                l[i] = int(l[i])
            except ValueError:
                continue
        syllable_dict[obs_map[word]] = l
        # print(word, l)
        # i += 1

print(syllable_dict[obs_map['adoting']])
print(syllable_dict[obs_map['even']])


[3, 'E2']
[1, 2]


We use the nltk corpus cmudict dataset to get the pronounciations for the words

In [26]:
reference = cmudict.dict()

In [90]:
# get the words that aren't known - just for reference
unknown = []

for key, value in obs_map.items():
    #print(key, value)
    if(key not in reference.keys()):
        unknown.append(key)

print(len(unknown))     

586


In [161]:
# see if our referencing works

print(reference['when'])
print(reference['bullshit'])
print(reference['pianist'])

[['W', 'EH1', 'N'], ['HH', 'W', 'EH1', 'N'], ['W', 'IH1', 'N'], ['HH', 'W', 'IH1', 'N']]
[['B', 'UH1', 'L', 'SH', 'IH2', 'T']]
[['P', 'IY0', 'AE1', 'N', 'AH0', 'S', 'T'], ['P', 'IY0', 'AA1', 'N', 'AH0', 'S', 'T'], ['P', 'IY1', 'AH0', 'N', 'IH0', 'S', 'T']]


The code below goes through all our words, and gets the final stress of the word, by examining the stress pattern and extracting the value of the last stress. In the case where there are multiple stress patterns for the word, and the last stresses are different, the code randomly chooses a pattern. If there are 3 patterns for a word, 2 which end in a stressed syllable, and 1 which ends unstressed, it is more likely to choose the stressed syllable.


Our stress dictionary is in the form {index, stress value} where stress value is 1 for stressed and 0 for unstressed, and is chosen randomly from the possible final stresses we can have

In [182]:
# go though all our words

# if it's in references, get the last syllable with
# an integer value at the end

# if the integer > 0, then it is stressed, if not it
# is unstressed

stress = {}
lastVow = {}
count = 0
# first lets go through all our words
for key, value in obs_map.items():
    # now go through all the ones in our references
    if(key in reference.keys()):
        # if(len(reference[key]) > 1):
        # go through the possible pronounciations, from
        # the end, and get the last vowel

        diff = []
        for pronounce in reference[key]:
            # this is only a "vowel" if the last character
            # is an integer
            # now iterate backwards through the strings in
            # the pronounciation, and get the k
            for i in range(len(pronounce) - 1, -1, -1):
                try:
                    # this will add the entire pronounciation
                    # and integer to our array
                    int(pronounce[i][-1])

                    # get rid of the -1 index if we want to look
                    # at the actual pronounciation
                    
                    # if we need to, we can count syllables here as well
                    # just by adding a count, and commenting out the break
                    # count += 1
                    
                    # there are acutally 2 types of stressed pronounciation
                    # here we don't care which one it is
                    diff.append(int(pronounce[i][-1]) % 2)
                    break
                except ValueError:
                    continue
        lastVow[value] = diff
        # syllables will just have the random choice of our lastVow for now
        stress[value] = random.choice(diff)

    # if we don't know, we'll just set it to random for now
    else:
        count += 1
        lastVow[value] = [0, 1]
        stress[value] = random.choice([0, 1])
# print(count)
# print(len(obs_map))
# print(lastVow[1], stress[1])
# print(stress)

In [184]:
# see how many have different stresses on the last value
count = 0
for key, value in lastVow.items():
    last = value[0]
    for e in value:
        if e != last:
            count += 1
            # print(key, reference[key])
            break
            
print(str(count), 'values with different stress on the last value')

680 values with different stress on the last value


This following code gets the syllables of the words, and adds it to syllable_dict

In [185]:
# here we can get the syllables

file = open('Release/data/Syllable_dictionary.txt', 'r')

syllable_dict = {}
keys = obs_map.keys()
print(len(obs_map.keys()))
i = 0
for line in file:
    l = line.strip().split()
    
    # all our words are saved to the observation dictionary with
    # special characters taken out, so we must use this regular
    # expression to strip out the special characters
    word = re.sub(r'[^\w]', '', l[0]).lower()
    if word in keys:
        syllable_dict[obs_map[word]] = sorted(l[1:])
        i += 1

print(syllable_dict[obs_map['adoting']])


3330
['3', 'E2']


In [181]:
print(stress[obs_map['from']])

1


In [None]:
# 85 known ones have different stress on the last value
# 586 more have unknown stress on the last value
# these numbers include our numbers at the top of each poem (for now)

.. So after writing the above code to generate the stress patterns, I found a the pronouncing library, which uses the cmu data and can apparently do the same thing. We won't be using this library for stress patterns, but we will be for finding rhymes. The import has been added to the top.

In [172]:
# as a test:
print(pronouncing.rhymes('bullshit'))

['accredit', 'armpit', 'baby-sit', 'beckwitt', 'cockpit', 'counterfeit', 'counterfeit', 'dipshit', 'drillbit', 'goldschmidt', 'hammerschmidt', 'horgavitt', 'horseshit', 'hypocrite', 'identikit', 'kleinschmidt', 'kuhlenschmidt', 'mandalit', 'messerschmidt', 'messerschmitt', 'misfit', 'moonlit', 'outfit', 'outwit', 'permit', 'pettit', 'pettitt', 'pettitte', 'proudfit', 'retrofit', 'rootkit', 'sunlit', 'tanartkit', 'telit', 'tidbit', 'waffenschmidt', 'waldschmidt', 'wolfenschmidt']


In [173]:
# lets limit our rhyming to our data set
def getRhyme(word):
    if word in reference.keys():
        rhymes = pronouncing.rhymes(word)
        # we can just take a set different here
        return sorted(list(set(rhymes) - (set(rhymes) - set(obs_map.keys()))))


In [174]:
# now let's test this
getRhyme('bullshit')

['counterfeit', 'permit']