# Install and Import Needed Packages and Models

First we make sure we have installed some basic things - json, numpy, sys.

In [2]:
import json
from random import sample
import sys

In [3]:
try:
    import ipywidgets as widgets
except:
    !pip install ipywidgets
    import ipywidgets as widgets

Second, we make sure we have installed spacy (which will split our input into words, assign a part of speech tag to each word and if necessary help us with named entity recognition), as well as python packages for:
* syllabification - currently, we import SyllableTokenizer from nltk, and syllabifier which uses the CMU pronunciation dictionary
* inflection - currently, we import lemminflect
* rhyming - currently, we import SoundsLike
and the nltk module for wordnet, to allow us to find words and calculate word distances.

In [4]:
# see https://spacy.io/usage/spacy-101
try:
    import spacy
except:
    !pip install spacy==3.2.4
    import spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [5]:
# see https://www.nltk.org/howto/wordnet.html
# see https://www.nltk.org/api/nltk.tokenize.sonority_sequencing.html
try:
    from nltk.corpus import wordnet
    from nltk.tokenize import SyllableTokenizer
    !python -m nltk.downloader wordnet
    !python -m nltk.downloader omw-1.4
except:
    !pip install nltk
    !python -m nltk.downloader wordnet
    !python -m nltk.downloader omw-1.4
    from nltk.corpus import wordnet
    from nltk.tokenize import SyllableTokenizer

[nltk_data] Downloading package wordnet to /Users/riley/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/riley/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
!git clone https://github.com/anson-vandoren/syllabifier.git
sys.path.insert(0, 'syllabifier')
from syllabifier import cmuparser3
from syllabifier.syllable3 import generate_syllables

fatal: destination path 'syllabifier' already exists and is not an empty directory.


In [7]:
# see https://github.com/bjascob/LemmInflect
try:
    from lemminflect import getInflection
except:
    !pip install lemminflect
    from lemminflect import getInflection

In [8]:
try:
    from SoundsLike.SoundsLike import Search
except:
    !pip install SoundsLike
    from SoundsLike.SoundsLike import Search

## Let's Experiment With Different Syllabifiers

Generally, the CMU dictionary-based one is more accurate. However, when it doesn't syllabify it doesn't syllabify *at all*. The other one always gives you something (albeit often incorrect).

In [9]:
class syllabify:
    def __init__(self, use_cmu = True):
        if use_cmu:
            self.syllabifier = cmuparser3.CMUDictionary()
        else:
            self.syllabifier = SyllableTokenizer()
        self.use_cmu = use_cmu
        
    def syllabify(self, term):
        if self.use_cmu:
            phoneme_str = self.syllabifier.get_first(term)
            if phoneme_str:
                return [str(x) for x in generate_syllables(phoneme_str)]
        else:
            return self.syllabifier.tokenize(term)

# Process a Poem

We take in a poem as a multi-line string. We split it on newlines so we can handle rhyming. We tokenize and part of speech tag. 

Then we choose 1 out of n "content" words (noun excluding proper nouns, verb, adjective, adverb) at random and replace it with another word having the same part of speech and number of syllables. If the chosen word is at the end of a line we try to ensure it rhymes.

Future work:
* ngrams / multiword expressions

In [10]:
class poem_replacer:
    def __init__(self, lexicon_file, use_cmu = True, use_pos = True, use_anagrams = False, use_rhyme = True, use_syllables = True, union_or_intersection = 'union', min_wn_distance = 0, max_wn_distance = -1):
        self.nlp = spacy.load("en_core_web_md")
        with open(lexicon_file) as f:
            self.lexicon = json.load(f)
        self.syllabifier = syllabify(use_cmu)
        self.pos_mappings =  {'NOUN': wordnet.NOUN, 'VERB': wordnet.VERB, 'ADJ': wordnet.ADJ, 'ADV': wordnet.ADV}
        self.use_cmu = use_cmu
        self.use_pos = use_pos
        self.use_anagrams = use_anagrams
        self.use_rhyme = use_rhyme
        self.use_syllables = use_syllables
        self.union_or_intersection = union_or_intersection
        self.min_wn_distance = min_wn_distance
        self.max_wn_distance = max_wn_distance

    def get_candidates_by_pos(self, term, pos):
        if pos in self.lexicon['by_pos']:
            possibles = self.lexicon['by_pos'][pos]
            if possibles:
                return [x for x in possibles if x != term]
        return []
    
    def get_candidates_by_syllables(self, term):
        syllables = self.syllabifier.syllabify(term)
        if str(len(syllables)) in self.lexicon['by_syllables']:
            possibles = self.lexicon['by_syllables'][str(len(syllables))]
            if possibles:
                return [x for x in possibles if x != term]
        return []
        
    def get_candidates_by_anagrams(self, term):
        if ''.join(sorted(term)) in self.lexicon['by_anagrams']:
            possibles = self.lexicon['by_anagrams'][''.join(sorted(term))]
            if possibles:
                return [x for x in possibles if x != term]
        return []
    
    def get_candidates_by_rhymes(self, term):
        possibles = []
        if term in self.lexicon['by_rhymes']:
            possibles = self.lexicon['by_rhymes'][term]
        syllables = self.syllabifier.syllabify(term)
        if len(syllables) > 0 and syllables[-1] in self.lexicon['by_last_syllables']:
            possibles = list(set(possibles).union(self.lexicon['by_last_syllables'][syllables[-1]]))
        if possibles:
            return [x for x in possibles if x != term]
        return []
        
    def wn_distance(self, synset, possible, pos):
        possible_synsets = wordnet.synsets(possible, self.pos_mappings[pos])
        if len(possible_synsets) > 0:
            dist = synset.wup_similarity(possible_synsets[0], simulate_root=False)
            if dist is not None:
                return dist
        return float('inf')

    def get_candidates(self, token, end_of_line = False):
        pos_possibles = []
        syllables_possibles = []
        anagrams_possibles = []
        rhyme_possibles = []
        if self.use_pos:
            pos_possibles = self.get_candidates_by_pos(token.text, token.pos_)
        if self.use_syllables:
            syllables_possibles = self.get_candidates_by_syllables(token.text)
        if self.use_anagrams:
            anagrams_possibles = self.get_candidates_by_anagrams(token.text)
        if self.use_rhyme and end_of_line:
            rhyme_possibles = self.get_candidates_by_rhymes(token.text)
        if self.union_or_intersection == 'union':
            possibles = list(set(pos_possibles).union(syllables_possibles, anagrams_possibles, rhyme_possibles))
        else:
            possibles_all = [x for x in [pos_possibles, syllables_possibles, anagrams_possibles, rhyme_possibles] if len(x) > 0]
            if len(possibles_all) > 0:
                possibles = possibles_all[0]
                for possible in possibles_all[1:]:
                    possibles = list(set(possibles).intersection(possible))
            else:
                possibles = []
        # if min_wn_distance or max_wn_distance > -1, then we reduce candidates to those within min and max Wordnet distance of the token
        # there are multiple Wordnet distance algorithms; we just use wup for now; see https://www.nltk.org/howto/wordnet.html
        if self.min_wn_distance > 0:
            t = wordnet.synsets(token.text, self.pos_mappings[token.pos_])
            if len(t) > 0:
                if self.max_wn_distance > -1:
                    possibles = [possible for possible in possibles if len(wordnet.synsets(possible, self.pos_mappings[token.pos_])) > 0 and self.wn_distance(t[0], possible, token.pos_) in range(self.min_wn_distance, self.max_wn_distance)]
                else:
                    possibles = [possible for possible in possibles if len(wordnet.synsets(possible, self.pos_mappings[token.pos_])) > 0 and self.wn_distance(t[0], possible, token.pos_) > self.min_wn_distance]
        return possibles                                 
        
    def get_tokens(self, text):
        # make a spacy document from the input text
        doc = self.nlp(text)
        # get all the tokens
        # see https://spacy.io/api/token
        # see https://universaldependencies.org/u/pos/index.html
        tokens = [token for token in doc]
        content_tokens = [token for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']]
        return tokens, content_tokens               
    
    def process(self, text, percent_to_replace):                                 
        tokens, content_tokens = self.get_tokens(text)
        number_to_replace = int(len(content_tokens) * (percent_to_replace / 100))
        # choose number_to_replace tokens to replace
        to_replace = sample(content_tokens, number_to_replace)
        # this contains the output words
        words = []
        # for each token
        for i, token in enumerate(tokens):
            # if the token is to be replaced
            if token in to_replace:
                # figure out if it is at the end of the poem or the end of a line; you can use this if you want to only match rhyme at end of line
                end_of_line = (i == len(tokens)-1 or tokens[i+1].pos_ == 'SPACE')
                # get possible replacement terms
                possibles = self.get_candidates(token, end_of_line = end_of_line)
                # choose one of them at random and replace!
                if len(possibles) > 0:
                    chosen = sample(possibles, 1)
                    words.append("#REPLACE_TOKEN" + chosen[0] + token.whitespace_)
                else:
                    words.append(token.text + token.whitespace_)
            else:
                words.append(token.text + token.whitespace_)
        return number_to_replace, [x.text for x in to_replace], words     

### Let's Try It Out!

In [15]:
text = '''You who in scattered rhymes listen to the sound 
Of those sighs with which I fed the heart
During that first youthful mistake of mine
When I was in part a different man than I am now
I hope I can find forgiveness and pithy 
For the diverse style in which I cry and reason,
Between useless hope and useless pain
From those who understand love out of experience.
Now I can see clearly how I have been a joke
To all the people, for a long time, so much so that when I think about it,I often feel ashamed of myself; 
Shame is the result of my ramblings,
Along with regret, and the clear realization
That what the world likes is but a brief dream'''

# PR = poem_replacer('lexicon.json')
# _, to_replace, words = PR.process(text, 10)
# print('plain:', ', '.join(to_replace), ''.join(words))

# PR = poem_replacer('lexicon.json', use_pos = True, use_anagrams = False, use_rhyme = False, use_syllables = False, union_or_intersection = 'intersection')
# _, to_replace, words = PR.process(text, 10)
# print('pos:', ', '.join(to_replace), ''.join(words))

# PR = poem_replacer('lexicon.json', use_pos = False, use_anagrams = True, use_rhyme = False, use_syllables = False, union_or_intersection = 'intersection')
# _, to_replace, words = PR.process(text, 10)
# print('anagrams:', ', '.join(to_replace), ''.join(words))

# PR = poem_replacer('lexicon.json', use_pos = False, use_anagrams = False, use_rhyme = True, use_syllables = False, union_or_intersection = 'intersection')
# _, to_replace, words = PR.process(text, 10)
# print('rhyme:', ', '.join(to_replace), ''.join(words))

# PR = poem_replacer('lexicon.json', use_pos = False, use_anagrams = False, use_rhyme = False, use_syllables = True, union_or_intersection = 'intersection')
# _, to_replace, words = PR.process(text, 10)
# print('syllables:', ', '.join(to_replace), ''.join(words))

# PR = poem_replacer('lexicon.json', use_pos = True, use_anagrams = True, use_rhyme = False, use_syllables = False, union_or_intersection = 'intersection')
# _, to_replace, words = PR.process(text, 10)
# print('pos, anagrams, intersection:', ', '.join(to_replace), ''.join(words))

# PR = poem_replacer('lexicon.json', use_pos = True, use_anagrams = True, use_rhyme = False, use_syllables = False, union_or_intersection = 'union')
# _, to_replace, words = PR.process(text, 10)
# print('pos, anagrams, union:', ', '.join(to_replace), ''.join(words))

PR = poem_replacer('lexicon.json', min_wn_distance=1, max_wn_distance=5)
_, to_replace, words = PR.process(text, 10)
print('wn_distances:', ', '.join(to_replace), ''.join(words))


wn_distances: part, understand, feel, style, useless You who in scattered rhymes listen to the sound 
Of those sighs with which I fed the heart
During that first youthful mistake of mine
When I was in #REPLACE_TOKENportions a different man than I am now
I hope I can find forgiveness and pithy 
For the diverse #REPLACE_TOKENway in which I cry and reason,
Between #REPLACE_TOKENuselessest hope and useless pain
From those who #REPLACE_TOKENunderstanding love out of experience.
Now I can see clearly how I have been a joke
To all the people, for a long time, so much so that when I think about it,I often #REPLACE_TOKENfeeling ashamed of myself; 
Shame is the result of my ramblings,
Along with regret, and the clear realization
That what the world likes is but a brief dream


# Dashboard

In [12]:
html = widgets.HTML(
    value="<H2>Poem Manipulator</H2>Enter your poem in the text area, select options beneath, choose number of substitutions, then click Manipulate!")
log = widgets.Output(layout={'border': '1px solid black', 'height': '40%', 'width': '97%'})
with log:
    print("Log notes will appear here")
header_box = widgets.VBox([html, log])

input_poem = widgets.Textarea(value='Enter poem here', placeholder='Enter poem here', layout={'border': '1px solid black', 'height': '100%', 'width': '95%'})

layout = widgets.Layout(width='auto', height='auto')
substitution_slider = widgets.IntSlider(min=0, max=100, value=10)
substitution_label = widgets.Label('Percent to replace: ', layout=widgets.Layout(width='40%'))
substitution_box = widgets.HBox([substitution_label, substitution_slider])

generate_button =  widgets.Button(description='Manipulate!', disabled=False)

min_wn_slider = widgets.IntSlider(min=-1, max=10, value=0)
min_wn_label = widgets.Label('Min WN distance: ', layout=widgets.Layout(width='40%'))
min_wn_box = widgets.HBox([min_wn_label, min_wn_slider])

max_wn_slider = widgets.IntSlider(min=-1, max=10, value=-1)
max_wn_label = widgets.Label('Max WN distance: ', layout=widgets.Layout(width='40%'))
max_wn_box = widgets.HBox([max_wn_label, max_wn_slider])

pos = widgets.Checkbox(True, layout=widgets.Layout(justify_content="flex-start"))
pos_label = widgets.Label('Use Part of Speech: ', layout=widgets.Layout(width='40%'))
pos_box = widgets.HBox([pos_label, pos])

rhyme = widgets.Checkbox(True)
rhyme_label = widgets.Label('Use Rhyme: ', layout=widgets.Layout(width='40%'))
rhyme_box = widgets.HBox([rhyme_label, rhyme])

anagrams = widgets.Checkbox(False)
anagrams_label = widgets.Label('Use Anagrams: ', layout=widgets.Layout(width='40%'))
anagrams_box = widgets.HBox([anagrams_label, anagrams])

syllables = widgets.Checkbox(True)
syllables_label = widgets.Label('Use Syllables: ', layout=widgets.Layout(width='40%'))
syllables_box = widgets.HBox([syllables_label, syllables])

uoi = widgets.RadioButtons(value='union', options=['union', 'intersection'])
uoi_label = widgets.Label('Combine possibles by: ', layout=widgets.Layout(width='40%'))
uoi_box = widgets.HBox([uoi_label, uoi])

left_box = widgets.VBox([substitution_box, generate_button], layout=widgets.Layout(width='80%', height='auto'))
right_box = widgets.VBox([min_wn_box, max_wn_box, pos_box, syllables_box, rhyme_box, anagrams_box, uoi_box], layout=widgets.Layout(width='80%', height='auto'))
box = widgets.HBox([left_box, right_box])

out = widgets.Output(layout={'border': '1px solid black', 'height': '100%', 'width': '95%'})
with out:
    print("Output poem will appear here")
    
app = widgets.AppLayout(header=header_box, left_sidebar=input_poem, footer=box, center=None, right_sidebar=out)


def process(b):
    PR = poem_replacer('lexicon.json', use_cmu=True, use_pos=pos.value, use_anagrams=anagrams.value, use_syllables=syllables.value, use_rhyme=rhyme.value, union_or_intersection=uoi.value, min_wn_distance=min_wn_slider.value, max_wn_distance=max_wn_slider.value)
    number_to_replace, to_replace, words = PR.process(input_poem.value, substitution_slider.value)
    log.clear_output()
    log.append_stdout(number_to_replace)
    log.append_stdout('tokens to be replaced:' + ', '.join(to_replace))
    out.clear_output()
    out.append_stdout(''.join(words))

generate_button.on_click(process)

app

AppLayout(children=(VBox(children=(HTML(value='<H2>Poem Manipulator</H2>Enter your poem in the text area, sele…

# Get a Lexicon

**You should only have to do this section ONCE. It is slow.**

We need a word list. For now, we use WordNet to get a word list. So from WordNet we extract all nouns, verbs, adjectives and adverbs. For each extracted term we collect all inflected forms.

We then construct lexicons: one by part of speech, one by number of syllables, one by anagrams, and one by final syllable:
* In order to handle replacement that respects syntax we need a lexicon by part of speech.
* In order to handle replacement that respects rhythm we need a lexicon by number of syllables.
* In order to handle replacement by anagrams we need a lexicon by anagrams.
* In order to handle rhyming, we use the last syllable of each word; we also use the CMU pronunciation dictionary to find homophones.

In [24]:
class wordnet_lexicon:
    def __init__(self):
        pass
        
    # Collect our lexicon by part of speech
    def collect_terms(self, use_cmu = True):
        # see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        types = {'NOUN': ['NN', 'NNS'], 'VERB': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'], 'ADJ': ['JJ', 'JJR', 'JJS'], 'ADV': ['RB', 'RBR', 'RBS']}
        syllabifier = syllabify(use_cmu)
        terms = {}
        # Wordnet only contains these classes of word
        for (tag, typ) in [(wordnet.NOUN, 'NOUN'), (wordnet.VERB, 'VERB'), (wordnet.ADJ, 'ADJ'), (wordnet.ADV, 'ADV')]:
            print("Processing", typ)
            for synset in list(wordnet.all_synsets(tag)):
                for lemma in synset.lemmas():
                    # consider in future: keep multi-word expressions
                    if '_' not in lemma.name():
                        if lemma.name() not in terms:
                            terms[lemma.name()] = {}
                        for form in types[typ]:
                            try:
                                inflected = getInflection(lemma.name(), tag=form)
                                if len(inflected) > 0 and inflected[0] not in terms[lemma.name()]:
                                    try:
                                        # could also use closeHomophones; may be slower
                                        rhymes = Search.perfectHomophones(inflected[0])
                                    except Exception as e:
                                        rhymes = []
                                    terms[lemma.name()][inflected[0]] = {'term': inflected[0], 'lemma': lemma.name(), 'pos': typ, 'syllables': syllabifier.syllabify(inflected[0]), 'letters': ''.join(sorted(inflected[0])), 'rhymes': rhymes}
                            except Exception as e:
                                print("error", lemma.name(), form, e)
        return terms #

    # Collect terms; construct lexicons - one by part of speech, one by number of syllables, one by anagrams, and one by final syllable; dump to a file
    def create_lexicon(self, lexicon_file, use_cmu = True):
        terms = self.collect_terms(use_cmu = use_cmu)
        lexicons = {'terms': terms, 'by_pos': {}, 'by_syllables': {}, 'by_last_syllables': {}, 'by_anagrams': {}, 'by_rhymes': {}}
        for lemma in terms:
            for entry in terms[lemma].values():
                # add it to the part of speech list
                if entry['pos'] not in lexicons['by_pos']:
                    lexicons['by_pos'][entry['pos']] = []
                lexicons['by_pos'][entry['pos']].append(entry['term'])
                if entry['syllables'] is not None:
                    # add it to the syllables list
                    if len(entry['syllables']) not in lexicons['by_syllables']:
                        lexicons['by_syllables'][len(entry['syllables'])] = []
                    lexicons['by_syllables'][len(entry['syllables'])].append(entry['term'])
                    # add it to the last-syllables list
                    if entry['syllables'][-1] not in lexicons['by_last_syllables']:
                        lexicons['by_last_syllables'][entry['syllables'][-1]] = []
                    lexicons['by_last_syllables'][entry['syllables'][-1]].append(entry['term'])
                # add it to the anagrams list
                if entry['letters'] not in lexicons['by_anagrams']:
                    lexicons['by_anagrams'][entry['letters']] = []
                lexicons['by_anagrams'][entry['letters']].append(entry['term'])
                # add it to the rhymes list
                for rhyme in entry['rhymes']:
                    if rhyme not in lexicons['by_rhymes']:
                        lexicons['by_rhymes'][rhyme] = []
                    lexicons['by_rhymes'][rhyme].append(entry['term'])                                    
        with open(lexicon_file, 'w') as f:
            json.dump(lexicons, f)

In [20]:
%%time
WL = wordnet_lexicon()
WL.create_lexicon('lexicon.json')

Processing NOUN
error F NNS list index out of range
error f NNS list index out of range
error F NNS list index out of range
error F NNS list index out of range
error F NNS list index out of range
error Th NNS list index out of range
error Th NNS list index out of range
Processing VERB
Processing ADJ
Processing ADV
CPU times: user 1h 25min 45s, sys: 5min 7s, total: 1h 30min 53s
Wall time: 9min 49s
