In [1]:
### Standard Imports
import numpy as np
import re
import sys
import os
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
import lib.autoencoder_utilities as ae_utils
import lib.rhyme_utilities as r_utils

In [3]:
### Text Parameters
# Special tokens handed to the tokenizer further down.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token, mask_token = '<new>', '<mask>'

### General Parameters
random_seed = 42
# Output directory (created by a later cell) and base name for the model.
model_folder = '../../../rhymer/v1'
model_name = 'ae_lstm_att_mask_rhymer'

### Model Parameters
window_len = 15
batch_size = 64
# Encoder and decoder share the same hidden size.
enc_dim = dec_dim = 256
learn_rate = 1e-3
epochs = 10
# Same rate for the regular and the recurrent dropout.
dropout = recurrent_dropout = 0.05

In [4]:
# Create the model output directory (and any missing parents); exist_ok=True
# makes the cell safe to re-run when the directory already exists.
os.makedirs(model_folder, exist_ok=True)

In [5]:
### Load Data
# `corpus` is what the tokenization cell below consumes.
corpus = utils.load_corpus()
# NOTE(review): split_corpus() is called without arguments, so it presumably
# loads the data itself rather than splitting `corpus` - confirm in lib.utilities.
# None of the four split results is used in the cells visible here.
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

In [6]:
### Pre-Processing Text
# Only the word counts and the two vocabulary lookups are kept; the other
# three values returned by tokenize_corpus are discarded.
_, word_count, index_to_vocab, vocab_to_index, _, _ = utils.tokenize_corpus(
    corpus,
    window_length=window_len,
    end_token=end_token,
    start_token=start_token,
    pad_token=pad_token,
    unk_token=unk_token,
    newline_token=newline_token,
    mask_token=mask_token,
)

In [7]:
def mask_rhyme_loc(verse, newline_token = '\n', mask_token = '<mask>',
                   rhyme_freq = 2, from_last = True):
    """Replace the last word of every ``rhyme_freq``-th line with ``mask_token``.

    The verse is split into lines on ``newline_token`` and each line into
    words on whitespace. Lines are then re-joined with single spaces, so runs
    of whitespace inside a line are collapsed.

    Parameters
    ----------
    verse : str
        Text whose lines are separated by ``newline_token``. Leading and
        trailing occurrences of the token are dropped.
    newline_token : str
        Non-empty line separator; may be longer than one character
        (e.g. ``'<new>'``).
    mask_token : str
        Replacement for the final word of each selected line.
    rhyme_freq : int
        Step between masked lines.
    from_last : bool
        If True, start at the last line and step backwards; the first line is
        never masked. If False, start at the first line and step forwards; the
        last line is never masked.

    Returns
    -------
    str
        The masked verse, lines joined by ``newline_token``.

    Raises
    ------
    ValueError
        If ``newline_token`` is empty.
    """
    if not newline_token:
        # str.split('') raises ValueError as well; fail early with a clearer
        # message (this also keeps the trimming loops below from spinning).
        raise ValueError('newline_token must be a non-empty string')

    # Trim whole separator tokens from both ends. str.strip() would treat a
    # multi-character token as a *set of characters* and eat letters from the
    # first/last word (e.g. 'know<new>'.strip('<new>') == 'kno').
    token_len = len(newline_token)
    while verse.startswith(newline_token):
        verse = verse[token_len:]
    while verse.endswith(newline_token):
        verse = verse[:-token_len]

    lines = [re.split(r'\s+', line.strip()) for line in verse.split(newline_token)]
    n_lines = len(lines)

    if from_last:
        # Stops before index 0, so the first line is left untouched.
        line_ind = range(n_lines - 1, 0, -rhyme_freq)
    else:
        # Stops before the final index, so the last line is left untouched.
        line_ind = range(0, n_lines - 1, rhyme_freq)

    for ind in line_ind:
        lines[ind][-1] = mask_token

    return newline_token.join(' '.join(line) for line in lines)

In [8]:
# Four-line sample verse, one source line per verse line; the trailing
# newline is part of the value.
test_verse = (
    "<start> <verse> Whenever I am feeling low\n"
    "I look around me and I know\n"
    "There's a place that will stay within me\n"
    "Wherever I may choose to go\n"
)

In [9]:
# Mask the sample verse with the packaged implementation in lib.rhyme_utilities
# (r_utils), not the mask_rhyme_loc defined in this notebook.
r_utils.mask_rhyme_loc(test_verse)

"<start> <verse> Whenever I am feeling low\nI look around me and I <mask>\nThere's a place that will stay within me\nWherever I may choose to <mask>"

In [13]:
import pronouncing

In [14]:
# Search pattern: phones 'AO2 R' anchored at the end by '$'.
pronouncing.search('AO2 R$')

['aancor',
 'accor',
 'acetochlor',
 'albacore',
 'amador',
 'amcor',
 'amcore',
 'americorp',
 'americorps',
 'amplocore',
 'ardmore',
 'backdoor',
 'baikonur',
 'baltimore',
 'bancor',
 'bangalor',
 'bangalore',
 'bangor',
 'barrymore',
 'barrymore',
 'beardmore',
 'becor',
 'bedore',
 'bellcore',
 'bercor',
 'biltmore',
 'bogor',
 'bookstore',
 'bruncor',
 'cantore',
 'carnivore',
 'cencor',
 'centaur',
 'centocor',
 'clarcor',
 'claymore',
 'commodore',
 'damore',
 'delcor',
 'dinosaur',
 'drugstore',
 'dumbledore',
 'ecuador',
 'el-salvador',
 'elcor',
 'elsinore',
 'encor',
 'encore',
 'equicor',
 'evermore',
 'eyesore',
 'fenimore',
 'fennimore',
 'fidelcor',
 'filmore',
 'finamore',
 'folklore',
 'furthermore',
 'gartmore',
 'gencor',
 'gencorp',
 'genencor',
 'glenmore',
 'goldcor',
 'healthcorp',
 'herbivore',
 'herbivore',
 'hors',
 'humidor',
 'hycor',
 'ifor',
 'igor',
 'indoor',
 'isidore',
 'jacor',
 'kenmore',
 'killgore',
 'kishore',
 'labrador',
 'lakeshore',
 'laramo

In [15]:
# Print the vocabulary entry for every non-zero position that the library
# helper returns for 'shore'.
# NOTE(review): this calls r_utils.get_rhyme_ind with three positional
# arguments; the get_rhyme_ind defined in this notebook accepts only two, so
# the two are not interchangeable. Meaning of the third argument (2) is not
# visible here - confirm in lib.rhyme_utilities.
for i in np.where(r_utils.get_rhyme_ind('shore', vocab_to_index, 2))[0]:
    print(index_to_vocab[i])

['AO1', 'R']
{'explore', 'wore', 'deplore', 'doar', 'hoar', 'rumore', 'rohr', 'decor', 'gabor', 'mazor', 'clore', 'scor', 'bore', 'bohr', 'balthazor', 'or', 'ngor', 'forr', 'faure', 'porr', 'prewar', 'store', 'mor', 'cor', 'vore', 'senor', 'moore', 'swore', 'oar', 'por', 'for', 'offshore', 'baur', 'flor', 'lahore', 'coar', 'anti-war', 'dorr', 'roquemore', 'lore', 'delore', 'outscore', 'melor', 'torre', 'elnore', "d'or", 'pore', 'dohr', 'lohr', 'anymore', 'war', 'inshore', 'hoerr', 'roehr', 'villasenor', 'doerr', 'underinsure', 'shor', 'gore', 'dior', 'stoehr', 'thor', 'mohr', 'woehr', 'postwar', 'glor', 'underscore', 'lamaur', 'saur', 'drawer', 'floor', 'chore', 'outpour', 'abhor', 'nor', 'fore', 'glore', 'longcor', 'yore', 'livor', 'ashore', 'tore', 'before', 'kohr', 'corr', 'laure', 'bensenyore', 'igor', 'sotomayor', 'stohr', 'heretofore', 'ore', 'torr', 'livermore', 'sedor', 'schorr', 'four', 'goar', 'guarantor', 'flore', 'lalor', 'montefiore', 'implore', 'morr', 'lenore', 'soar', '

In [12]:
# Same check as the 'shore' cell, for 'singapore': print the vocabulary entry
# for every non-zero position returned by the library helper.
# NOTE(review): three-argument call to r_utils.get_rhyme_ind; the notebook's
# own get_rhyme_ind takes two. This cell's execution count (12) is also lower
# than that of the cells above it, so the notebook was run out of order.
for i in np.where(r_utils.get_rhyme_ind('singapore', vocab_to_index, 2))[0]:
    print(index_to_vocab[i])

['AO2', 'R']
{'nevermore', 'bellcore', 'genencor', 'amplocore', 'millipore', 'centaur', 'baltimore', 'biltmore', 'clarcor', 'normcore', 'singapore', 'oncor', 'pastore', 'suncor', 'samcor', 'nellcor', 'trovatore', 'matador', 'sofamor', 'kishore', 'elsinore', 'spatafore', 'el-salvador', 'longshore', 'claymore', 'unprofor', 'twenty-four', 'bangalore', 'healthcorp', 'mordor', 'finamore', 'baikonur', 'outdoor', 'bancor', 'syncor', 'acetochlor', 'backdoor', 'folklore', 'pricor', 'evermore', 'filmore', 'stevedore', 'phosphor', 'humidor', 'bedore', 'americorps', 'herbivore', 'ardmore', 'laramore', 'elcor', 'bercor', 'accor', 'rossmore', 'mentor', 'amcor', 'fidelcor', 'nicor', 'zocor', 'carnivore', 'albacore', 'fennimore', 'legore', 'delcor', 'furthermore', 'tenore', 'uproar', 'gencor', 'drugstore', 'bruncor', 'indoor', 'isidore', 'bangor', 'therefore', 'newcor', 'cantore', 'onshore', 'nucor', 'igor', 'theodore', 'ticor', 'parador', 'americorp', 'sycamore', 'talmor', 'barrymore', 'mevacor', 'bo

In [11]:
def get_rhyme_ind(word, vocab_to_index_dict):
    """Build a multi-hot vector over the vocabulary marking rhymes of ``word``.

    The word is split into syllables, rhymes of its last syllable are looked
    up with ``pronouncing.rhymes``, and every rhyme that is present in
    ``vocab_to_index_dict`` gets a 1 at its index.

    Parameters
    ----------
    word : str
        Word to find rhymes for.
    vocab_to_index_dict : dict
        Maps vocabulary words to integer positions; positions are used to
        index a vector of length ``len(vocab_to_index_dict)``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(vocab_to_index_dict)``: 1 at the
        positions of in-vocabulary rhymes, 0 elsewhere. All zeros when the
        word yields no syllables or no rhyme is in the vocabulary.
    """
    vec_oh = np.zeros(len(vocab_to_index_dict))

    # NOTE(review): SyllableTokenizer is not imported in any cell visible here
    # (presumably nltk.tokenize.SyllableTokenizer) - confirm it is imported
    # before this cell runs on a fresh kernel.
    syllable = SyllableTokenizer().tokenize(word)
    if not syllable:
        # Nothing to rhyme with; avoids an IndexError on syllable[-1].
        return vec_oh

    for rhyme in pronouncing.rhymes(syllable[-1]):
        ind = vocab_to_index_dict.get(rhyme)
        # Skip out-of-vocabulary rhymes. Filtering here, instead of calling
        # set.remove(None) afterwards, avoids a KeyError when every rhyme is
        # in the vocabulary.
        if ind is not None:
            vec_oh[ind] = 1
    return vec_oh