In [18]:
import re
import sys
import random
import numpy as np
import tensorflow as tf

In [7]:
# Load the sonnet text file and convert to lowercase
sonnet_text = open("./data/sonnet_data/Sonnet.txt", "r").read().lower()

# Create a set of unique characters present in the text
chars = sorted(list(set(sonnet_text)))

In [16]:
def remove_words_with_symbols(text):
    '''
    Removes words with symbols from text
    '''
    pattern = r'\b\w*[^a-zA-Z0-9\s]\w*\b'
    clean_text = re.sub(pattern, '', text)
    return clean_text

def remove_inverted_commas(text):
    '''
    Removes inverted commas from text
    '''
    pattern = r'^"|"$'
    clean_text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return clean_text

def remove_punctuation(text):
    '''
    Removes punctuation from text
    '''
    pattern = r'[^\w\s]'
    clean_text = re.sub(pattern, '', text)
    return clean_text

def convert_to_lower(text):
    '''
    Converts text to lower case
    '''
    return text.lower()

In [20]:
sonnet_text = remove_words_with_symbols(sonnet_text)
sonnet_text = remove_inverted_commas(sonnet_text)
sonnet_text = remove_punctuation(sonnet_text)
sonnet_text = convert_to_lower(sonnet_text)

In [21]:
# Create dictionaries to map characters to indices and vice versa
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

In [22]:
# Create input sequences and output labels by sliding a window of length seq_length
seq_length = 40
step_size = 3
input_seqs = []
output_labels = []
for i in range(0, len(sonnet_text) - seq_length, step_size):
    input_seqs.append(sonnet_text[i:i+seq_length])
    output_labels.append(sonnet_text[i+seq_length])

In [23]:
# Convert input sequences and output labels to arrays of indices
X = np.zeros((len(input_seqs), seq_length, len(chars)), dtype=np.bool)
y = np.zeros((len(input_seqs), len(chars)), dtype=np.bool)
for i, input_seq in enumerate(input_seqs):
    for j, char in enumerate(input_seq):
        X[i, j, char_to_idx[char]] = 1
    y[i, char_to_idx[output_labels[i]]] = 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.zeros((len(input_seqs), seq_length, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(input_seqs), len(chars)), dtype=np.bool)


In [24]:
# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(seq_length, len(chars))),
    tf.keras.layers.Dense(len(chars), activation="softmax")
])

In [25]:
# Compile the model with categorical crossentropy loss and RMSprop optimizer
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [26]:
def generated_text():
    # Generate text after each epoch
    start_index = random.randint(0, len(sonnet_text) - seq_length - 1)
    generated_text = sonnet_text[start_index:start_index+seq_length]
    sys.stdout.write(generated_text)
    for i in range(400):
        X_pred = np.zeros((1, seq_length, len(chars)))
        for j, char in enumerate(generated_text):
            X_pred[0, j, char_to_idx[char]] = 1
        pred = model.predict(X_pred, verbose=0)[0]
        next_char = idx_to_char[np.argmax(pred)]
        generated_text += next_char
        generated_text = generated_text[1:]
        sys.stdout.write(next_char)
        sys.stdout.flush()

In [27]:
# Train the model for 50 epochs
for epoch in range(50):
    print(f"{epoch}/50, going strong!")
    model.fit(X, y, batch_size=128, epochs=1)
    # Generate text after each epoch
    start_index = random.randint(0, len(sonnet_text) - seq_length - 1)
    generated_text = sonnet_text[start_index:start_index+seq_length]
    sys.stdout.write(generated_text)
    for i in range(400):
        X_pred = np.zeros((1, seq_length, len(chars)))
        for j, char in enumerate(generated_text):
            X_pred[0, j, char_to_idx[char]] = 1
        pred = model.predict(X_pred, verbose=0)[0]
        next_char = idx_to_char[np.argmax(pred)]
        generated_text += next_char
        generated_text = generated_text[1:]
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

0/50, going strong!
he upon your soundless deep doth ride
or th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th th a h th
1/50, going strong!

KeyboardInterrupt: 

In [14]:
generated_text()

 love her, because thou knowâ€™st i lovej....:ca:oo:ommm!m!mm™™™™™™ppeepee)p¿;;;;»»
yyyyy,,,,,â,â,â,ââ,â,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ,ââ

In [23]:
model.save('./models/sonnet_generator.h5')