In [1]:
import re
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Load the sonnet corpus and break it into individual sonnets
# NOTE(review): no encoding is passed, so the platform default is used when decoding.
with open("../data/sonnet_data/Sonnet.txt", "r") as sonnet_file:
    sonnet_data = sonnet_file.read()

# A blank line (two consecutive newlines) marks the gap between sonnets
sonnets = sonnet_data.split("\n\n")


In [3]:
# Load the play corpus and break it into individual dialogue lines
# NOTE(review): no encoding is passed, so the platform default is used when decoding.
with open("../data/plays_data/alllines.txt", "r") as play_file:
    play_data = play_file.read()

# Each line of the file is treated as one dialogue entry
play_dialogues = play_data.split("\n")


In [4]:
def remove_words_with_symbols(text):
    '''
    Strips out every word that has a symbol embedded in or attached to it.

    A match is a run of word characters containing one character that is
    neither an ASCII letter/digit nor whitespace, delimited by word
    boundaries (for example "don't" or "well-known"). Each match is replaced
    by the empty string; the whitespace around it is left untouched.
    '''
    symbol_word = re.compile(r'\b\w*[^a-zA-Z0-9\s]\w*\b')
    return symbol_word.sub('', text)

def remove_inverted_commas(text):
    '''
    Drops a double-quote character sitting at the very start or very end of
    any line in text. Quotes in the middle of a line are kept.
    '''
    edge_quote = re.compile(r'^"|"$', re.MULTILINE)
    return edge_quote.sub('', text)

def remove_punctuation(text):
    '''
    Deletes every character that is neither a word character (letters,
    digits, underscore) nor whitespace.
    '''
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

def convert_to_lower(text):
    '''
    Returns a lower-cased copy of text.
    '''
    lowered = text.lower()
    return lowered

In [5]:
# Drop symbol-bearing words from each sonnet, updating the list in place
for idx, sonnet in enumerate(sonnets):
    sonnets[idx] = remove_words_with_symbols(sonnet)

In [6]:
# Strip the leading/trailing double quote from each dialogue line, in place
for idx, dialogue in enumerate(play_dialogues):
    play_dialogues[idx] = remove_inverted_commas(dialogue)

In [7]:
# Remove punctuation from both corpora, updating each list in place
for idx, dialogue in enumerate(play_dialogues):
    play_dialogues[idx] = remove_punctuation(dialogue)

for idx, sonnet in enumerate(sonnets):
    sonnets[idx] = remove_punctuation(sonnet)

In [8]:
# Lower-case both corpora, updating each list in place
for idx, dialogue in enumerate(play_dialogues):
    play_dialogues[idx] = convert_to_lower(dialogue)

for idx, sonnet in enumerate(sonnets):
    sonnets[idx] = convert_to_lower(sonnet)

In [18]:
# Select the training corpus.
# The play dialogues are currently excluded: only the sonnets are used.
# Note that combined_data is the same list object as sonnets, not a copy.
# combined_data = sonnets + play_dialogues
combined_data = sonnets

In [19]:
# Tokenize the text
# Fit a word-level vocabulary on the training texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_data)
# Vocabulary size used for the embedding and output layers.
# The +1 presumably accounts for index 0, which Keras leaves unassigned to any word - confirm against the Tokenizer docs.
total_words = len(tokenizer.word_index) + 1

In [20]:
# Build the training n-grams: for every text, emit each prefix of its token
# sequence that holds at least two tokens (context plus the word to predict).
input_sequences = []
for line in combined_data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [21]:
# Bring every n-gram to the length of the longest one; padding='pre' puts the
# filler in front so the word to predict stays in the last position.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [22]:
# Split input sequences into input and output
# Each row is one padded n-gram: everything except the last token is the model
# input, and the last token is the word to predict. The sequences are already
# a uniform-length array at this point, so they are not padded a second time.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [23]:
# Define the model
# Next-word predictor: embedding -> two stacked LSTMs -> dense softmax over the vocabulary.
model = tf.keras.Sequential([
    # Inputs are n-gram prefixes of max_sequence_len - 1 tokens (the final token is the label)
    tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_len-1),
    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)),
    # return_sequences=True hands the full sequence to the second LSTM
    tf.keras.layers.LSTM(150, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(100),
    # A layer size must be an integer; `/` yields a float in Python 3, so use floor division
    tf.keras.layers.Dense(total_words // 2, activation='relu'),
    # One output probability per vocabulary index
    tf.keras.layers.Dense(total_words, activation='softmax')
])

In [24]:
# Compile the model
# The sparse variant of categorical cross-entropy is used because the labels
# are integer word indices rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [33]:
# Train the model
# xs holds the n-gram prefixes and labels the index of the word that follows each one.
# The object returned by fit is kept in `history` for later inspection.
history = model.fit(xs, labels, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

In [26]:
import tensorflow as tf
import pickle

In [27]:
# model = tf.keras.models.load_model('../models/sonnet_generator.h5')

# Generate a sonnet
seed_text = "the love was flowered"
next_words = 10  # Number of words to generate in the sonnet
# Load the fitted tokenizer; the context manager closes the file once it is read.
# NOTE(review): unpickling can execute arbitrary code - only load pickles you created yourself.
# NOTE(review): this reads '../tokenizer.pickle', whereas the tokenizer is saved to
# 'tokenizer.pickle' elsewhere in this notebook - confirm which file is intended.
with open('../tokenizer.pickle', 'rb') as handle:
    tokenizerLoaded = pickle.load(handle)

In [31]:
# Grow seed_text one predicted word at a time until it holds 100 words.
# The prompt has to be padded to the input length the model was built with
# (max_sequence_len - 1); padding to any other length feeds the network a
# shape it was never trained on.
model_input_len = max_sequence_len - 1
for _ in range(100 - len(seed_text.split())):
    token_list = tokenizerLoaded.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=model_input_len, padding='pre')
    # One input row gives one probability row; take the most likely word index as a plain int
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # index_word is the tokenizer's index -> word mapping; an index with no
    # word attached yields "", exactly as the former linear scan did.
    output_word = tokenizerLoaded.index_word.get(predicted, "")
    seed_text += " " + output_word



In [32]:
# Show the generated text: the original seed followed by the predicted words
print(seed_text)

the love was flowered the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


In [36]:
# Write the trained model to new_model.h5 in the current working directory
model.save('new_model.h5')

In [38]:
# saving
# Serialize the fitted tokenizer so that text generation can reuse the same
# word-to-index mapping the model was trained with.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# loading
# Restore the tokenizer from tokenizer.pickle in the current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load pickles you created yourself.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizerLoaded = pickle.load(handle)

In [40]:
# Display the padded n-gram length; the model's input length is this value minus one
max_sequence_len

163

In [93]:
def generate_sonnet(seed_text, num_words=14, max_seq_len=163):
    '''
    Extends seed_text with greedily predicted words and returns the result.

    seed_text: starting phrase; it is tokenized with tokenizerLoaded.
    num_words: how many words to append (default 14, the previously fixed count).
    max_seq_len: padded n-gram length used for training (default 163, the
        value this notebook reports for max_sequence_len); the prompt is
        padded to one token less than this.

    Returns seed_text followed by one space-separated word per prediction step.
    Relies on the notebook globals model, tokenizerLoaded, pad_sequences and np.
    '''
    for _ in range(num_words):
        token_list = tokenizerLoaded.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        # One input row gives one probability row; take the most likely word index as a plain int
        predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
        # Decode with the same tokenizer that encoded the prompt so the two
        # mappings cannot disagree; an index with no word attached yields "".
        output_word = tokenizerLoaded.index_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text

In [98]:
# Generate a continuation for the two-word seed "loving beyond"
pred1 = generate_sonnet("loving beyond")



In [99]:
# `pred` is not assigned anywhere in the notebook, so this cell fails on a fresh kernel.
# NOTE(review): the recorded output is the word "love" followed by 14 generated
# words, so it presumably came from the call below - confirm the intended seed.
pred = generate_sonnet("love")
pred

'love to the world and i will tell you what a man is not a'

In [100]:
# Display the text generated from the "loving beyond" seed
pred1

'loving beyond the world and i will be a little of the world and i am'