In [27]:
# global imports
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time
import nltk
import re
from collections import Counter
import pickle
import sys

# relative imports
THIS_DIR = os.getcwd()
sys.path.append(f'{THIS_DIR}/g2p_en')
from g2p import G2p
import expand

In [28]:
# enables use of tensorboard
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


## GENERATE SIMPLE RAW TEXT FILE FROM POETRY FILES 

In [29]:
# sets poems subfolder to merge all poems into large text file
POEMS_FOLDER = 'poems/'
POEM_FULL_PATH = os.path.join(THIS_DIR, POEMS_FOLDER)

In [30]:
poem_files = [poem_file for poem_file in os.listdir(POEM_FULL_PATH)]
all_poems_text = ""
for poem_file in poem_files:
    with open(os.path.join(POEM_FULL_PATH, poem_file), 'r') as f:
        all_poems_text += f.read()

## NORMALIZE AND CLEAN DATA

1. Remove rare characters (those that appear less than 5 times),
2. Substitute angled quotes with regular quotes,
3. Convert character data into phonemes (I hypothesize that phoneme data will better represent poetic language).

In [21]:
def clean_text(text, rare_chars):
    '''Helper function that removes angled quotes and rare characters'''
    text = re.sub(r"“", '"', text)
    text = re.sub(r"”", '"', text) 
    text = re.sub(r"‘", "'", text)
    text = re.sub(r"’", "'", text)
    text = re.sub(re.compile("|".join(rare_chars)), "", text)
    return text

In [39]:
BUILD_PHONEMES = True 
if BUILD_PHONEMES:
    rare_chars = Counter(all_poems_text)
    rare_chars = [k for k,v in rare_chars.items() if v<=5]
    all_poems_text = clean_text(all_poems_text, rare_chars)
    # Uses a customized version of g2p that maintains newlines and other important punctuation characters
    g2p = G2p()
    all_poems_phonemes = g2p(all_poems_text[:1000])
    with open('all_poems_phonemes.pickle', 'wb') as f:
        pickle.dump(all_poems_phonemes, f)
else: 
    with open('all_poems_phonemes.pickle', 'rb') as f:
        all_poems_phonemes = pickle.load(f)
        
print(all_poems_text[:1000])
print(all_poems_phonemes[:1000])

# Letter 7
## Michael Palmer
But the buried walls and our mouths of fragments,
 _no us but the snow staring at us_ . . .

And you Mr. Ground-of_what, Mr. Text, Mr. Is-Was,
can you calculate the ratio between wire and window,

between tone and row, copula and carnival
and can you reassemble light from the future-past

in its parabolic nest
or recite an entire winter's words,

its liberties and psuedo-elegies,
the shell of a street-car in mid-turn

or scattered fires in the great hall
I would say not-I here I'd say _The Book of Knots_

I'd say undertows and currents and waterspouts,
streaks of phosphorus and rivervine winds

Dear Z, I'd say it's time, it's nearly time, it's almost, it's
       just about, it's long
past time now time now for the vex- for the vox- for the
       voices of shadows,

time for the prism letters, trinkets and shrouds,
for a whirl in gauzy scarves around the wrecked piazza

Messieurs-Dames, Meine Herren und Damen, our word-ballon,
       you will note, is slow

## CREATE CHAR TO INT MAPS

In [22]:
vocab = sorted(set(all_poems_phonemes))
print(vocab)
print('{} unique characters'.format(len(vocab)))
character_index_map = {c:i for i, c in enumerate(vocab)}
index_character_map = np.array(vocab)
text_as_int_array = np.array([character_index_map[c] for c in all_poems_text])
# Show how the first 13 characters from the text are mapped to integers
print (f'{repr(all_poems_text[:13])} -- mapped to int -- > {text_as_int_array[:13]}')

[' ', '!', '"', '#', "'", ',', '-', '.', '.\n\n\n. . .', '.\n\n   ...', '.\n\n. . .', '.\n\n...', '.\n   .\n   .', '.\n  .', '.\n . . .', '.\n.\n\n.\n.', '.\n. .', '.\n...', '.\n...\n...\n...', '. .', '. .\n.', '. .\n. .', '. . .', '. . .\n\n. . .', '. . .\n   . . .', '. . .  . . .', '. . . .', '. . . . .', '. . . . . . . . . . . .', '. . ..', '..', '...', '...\n\n\n...', '...\n\n...', '...\n...', '...\n...\n...', '?', 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', '_', '__', '___']
109 unique characters


KeyError: 'e'

In [19]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(all_poems_text)//(seq_length+1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int_array)

sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [9]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 128

# Number of RNN units
rnn_units = 1024

# Number of lstm layers
lstm_layers = 1

# Number of dense layers
dense_layers = 1

In [10]:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Dropout

def build_model(vocab_size, embedding_dim, rnn_units, batch_size, lstm_layers):
    model = tf.keras.Sequential()
    model.add(Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
    for i in range(lstm_layers):
        model.add(Bidirectional(LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform')))
        model.add(Dropout(0.1))
    tf.keras.layers.Dense(vocab_size)
    return model

In [12]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE, 
  lstm_layers=lstm_layers
)

In [80]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 151) # (batch_size, sequence_length, vocab_size)


In [81]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           38656     
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_1 (Dense)              (64, None, 151)           154775    
Total params: 4,131,735
Trainable params: 4,131,735
Non-trainable params: 0
_________________________________________________________________


In [82]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 151)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       5.01997


In [93]:
model.compile(optimizer='adam', loss=loss)

In [101]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [102]:
EPOCHS=10
log_dir=f"logs/fit/cgru_batch{BATCH_SIZE}_{rnn_units}units_embed{embedding_dim}_{str(time.time()//1)}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [103]:
%tensorboard --logdir logs/fit
history = model.fit(dataset, epochs=EPOCHS, steps_per_epoch=172, callbacks=[checkpoint_callback, tensorboard_callback])


Train for 172 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 26/172 [===>..........................] - ETA: 4:45 - loss: 1.5969

In [130]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

def generate_text(model, start_string):
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [character_index_map[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Experiment to find the best setting.
  temperature = .5

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(index_character_map[predicted_id])

  return (start_string + ''.join(text_generated))

In [None]:
print(generate_text(model, start_string=u"Shall I compare thee to a summers day\n"))