In [1]:
# global imports
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time
import nltk
import re
from collections import Counter
import pickle
import sys

# relative imports
THIS_DIR = os.getcwd()
sys.path.append(f'{THIS_DIR}/g2p_en')
from g2p import G2p
import expand

In [2]:
# enables use of tensorboard
%load_ext tensorboard

## GENERATE SIMPLE RAW TEXT FILE FROM POETRY FILES 

In [3]:
# sets poems subfolder to merge all poems into large text file
POEMS_FOLDER = 'poems/'
POEM_FULL_PATH = os.path.join(THIS_DIR, POEMS_FOLDER)

In [4]:
poem_files = [poem_file for poem_file in os.listdir(POEM_FULL_PATH)]
all_poems = []
all_poems_text = ""
for poem_file in poem_files:
    with open(os.path.join(POEM_FULL_PATH, poem_file), 'r') as f:
        all_poems.append("\n".join(f.readlines()[2:]))
        all_poems_text+=f.read()

## NORMALIZE AND CLEAN DATA

1. Remove rare characters (those that appear less than 5 times),
2. Substitute angled quotes with regular quotes,
3. Convert character data into phonemes (I hypothesize that phoneme data will better represent poetic language).

In [5]:
def clean_text(text, rare_chars):
    '''Helper function that removes angled quotes and rare characters'''
    text = re.sub(r"“", '"', text)
    text = re.sub(r"”", '"', text) 
    text = re.sub(r"‘", "'", text)
    text = re.sub(r"’", "'", text)
    text = re.sub(re.compile("|".join(rare_chars)), "", text)
    text = re.sub(r"\n", "~", text)
    return text

In [6]:
BUILD_PHONEMES = False 
if BUILD_PHONEMES:
    rare_chars = Counter(all_poems_text)
    rare_chars = [k for k,v in rare_chars.items() if v<=5]
    all_poems_text = clean_text(all_poems_text, rare_chars)
    # Uses a customized version of g2p that maintains newlines and other important punctuation characters
    g2p = G2p()
    all_poems_phonemes = g2p(all_poems_text)
    with open('all_poems_phonemes.pickle', 'wb') as f:
        pickle.dump(all_poems_phonemes, f)
    with open('phoneme_word_dict.pickle', 'wb') as f:
        pickle.dump(g2p.word_map, f)
else: 
    with open('all_poems_phonemes.pickle', 'rb') as f:
        all_poems_phonemes = pickle.load(f)
    with open('phoneme_word_dict.pickle', 'rb') as f:
        phoneme_word_dict = pickle.load(f)

## CREATE CHAR TO INT MAPS

In [7]:
vocab = sorted(set(all_poems_phonemes))
print(vocab)
print('{} unique characters'.format(len(vocab)))
character_index_map = {c:i for i, c in enumerate(vocab)}
index_character_map = np.array(vocab)
text_as_int_array = np.array([character_index_map[c] for c in all_poems_phonemes])

# Show how the first 13 characters from the text are mapped to integers
print (f'{repr(all_poems_phonemes[:13])} -- mapped to int -- > {text_as_int_array[:13]}')

[' ', '!', '"', '#', "'", ',', '-', '.', '. .', '. . .', '. . .  . . .', '. . . .', '. . . . .', '. . . . . . . . . . . .', '. . ..', '..', '...', '?', 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', '_', '__', '___', '~']
91 unique characters
['#', ' ', 'L', 'EH1', 'T', 'ER0', ' ', 'S', 'EH1', 'V', 'AH0', 'N', ' '] -- mapped to int -- > [ 3  0 60 41 74 43  0 72 41 82 24 62  0]


In [8]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(all_poems_phonemes)//(seq_length+1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int_array)

sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [9]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 512 

# Number of lstm layers
num_gru_layers = 1
gru_dropout = .1

is_bidirectional = True

# Number of dense layers
num_dense_layers = 1
dense_dropout = 0

In [10]:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
num_elems = len(list(dataset.as_numpy_iterator()))
val = .1
val_dataset = dataset.take(int(num_elems*val))
train_dataset = dataset.skip(int(num_elems*val))

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Bidirectional, Dropout

def conditional_bidirection(layer, is_birdirectional):
    if(is_bidirectional):
        return Bidirectional(layer)
    else:
        return layer
    
def build_model(vocab_size, embedding_dim, rnn_units, batch_size, num_gru_layers):
    model = tf.keras.Sequential()
    model.add(Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
    for i in range(num_gru_layers):
        model.add(conditional_bidirection(GRU(rnn_units,
                    return_sequences=True,
                    stateful=True,
                    recurrent_initializer='glorot_uniform'),
                  is_bidirectional)
                 )
    if(gru_dropout>0):
        model.add(Dropout(gru_dropout))
    for i in range(num_dense_layers):
        model.add(Dense(vocab_size))
        if(dense_dropout>0):
            model.add(Dropout(dense_dropout))
    return model

In [12]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE, 
  num_gru_layers=num_gru_layers
)

In [13]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 91) # (batch_size, sequence_length, vocab_size)


In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           23296     
_________________________________________________________________
bidirectional (Bidirectional (64, None, 1024)          2365440   
_________________________________________________________________
dropout (Dropout)            (64, None, 1024)          0         
_________________________________________________________________
dense (Dense)                (64, None, 91)            93275     
Total params: 2,482,011
Trainable params: 2,482,011
Non-trainable params: 0
_________________________________________________________________


In [15]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 91)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.512229


In [16]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [18]:
EPOCHS=10
model_name = f"{'b' if is_bidirectional else ''}gru_{num_gru_layers}l_{BATCH_SIZE}b_{rnn_units}u_{embedding_dim}e_{gru_dropout}d_dense_{num_dense_layers}l_{dense_dropout}d_{EPOCHS}epochs_{str(time.time()//1)}"
log_dir=f"logs/fit/{model_name}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
%tensorboard --logdir logs/fit
history = model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS, steps_per_epoch=180, callbacks=[checkpoint_callback, tensorboard_callback])


Reusing TensorBoard on port 6006 (pid 9693), started 4:00:06 ago. (Use '!kill 9693' to kill it.)

Train for 180 steps, validate for 142 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 12/180 [=>............................] - ETA: 2:46 - loss: 0.5824 - accuracy: 0.8826

In [None]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1, lstm_layers=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

def key_or_closest(word):
    try:
        return phoneme_word_dict[word]
    except:
        keys = set(phoneme_word_dict.keys())
        while(word not in keys):
            word=word[:-1]
            if len(word) == 1:
                word = '#'
                break
        return phoneme_word_dict[word]
    
def generate_text(model, start_string):
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [character_index_map[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Experiment to find the best setting.
  temperature = 1

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(index_character_map[predicted_id])

  ph_text = (str(start_string) + ''.join(text_generated))
  print(ph_text)
  return " ".join([key_or_closest(word) for word in ph_text.split(" ")])

In [None]:
print(generate_text(model, start_string='# '))