In [55]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load and preprocess sonnet data
# Decode explicitly as UTF-8 so the text read does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes Sonnet.txt is UTF-8 (or plain ASCII) - confirm.
with open("./data/sonnet_data/Sonnet.txt", "r", encoding="utf-8") as file:
    sonnet_data = file.read()

# Individual sonnets are separated by a blank line (two consecutive newlines)
sonnets = sonnet_data.split("\n\n")


In [3]:
# Load and preprocess play data
# Decode explicitly as UTF-8 so the text read does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes alllines.txt is UTF-8 (or plain ASCII) - confirm.
with open("./data/plays_data/alllines.txt", "r", encoding="utf-8") as file:
    play_data = file.read()

# One dialogue entry per line of the file
play_dialogues = play_data.split("\n")


In [16]:
def remove_words_with_symbols(text):
    '''
    Strips out every word that contains a symbol.

    A "word" here is a run delimited by regex word boundaries; it is deleted
    when it contains at least one character that is neither an ASCII letter,
    an ASCII digit nor whitespace (so apostrophes, hyphens and underscores all
    count as symbols). Surrounding whitespace is left untouched.
    '''
    symbol_word = re.compile(r'\b\w*[^a-zA-Z0-9\s]\w*\b')
    return symbol_word.sub('', text)

def remove_inverted_commas(text):
    '''
    Drops a double quote sitting at the very start or very end of a line.

    The match is done per line (multi-line mode); double quotes in the middle
    of a line are kept.
    '''
    edge_quote = re.compile(r'^"|"$', flags=re.MULTILINE)
    return edge_quote.sub('', text)

def remove_punctuation(text):
    '''
    Deletes every character that is neither a word character nor whitespace.

    Word characters follow the regex \\w class, so letters, digits and the
    underscore are kept.
    '''
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def convert_to_lower(text):
    '''
    Returns a lower-cased copy of text
    '''
    lowered = text.lower()
    return lowered

In [7]:
# Drop symbol-containing words from every sonnet (list updated in place)
sonnets[:] = [remove_words_with_symbols(sonnet) for sonnet in sonnets]

In [11]:
# Strip leading/trailing double quotes from every play line (list updated in place)
play_dialogues[:] = [remove_inverted_commas(dialogue) for dialogue in play_dialogues]

In [14]:
# Remove punctuation from both corpora (lists updated in place)
play_dialogues[:] = [remove_punctuation(dialogue) for dialogue in play_dialogues]

sonnets[:] = [remove_punctuation(sonnet) for sonnet in sonnets]

In [17]:
# Lower-case both corpora (lists updated in place)
play_dialogues[:] = [convert_to_lower(dialogue) for dialogue in play_dialogues]

sonnets[:] = [convert_to_lower(sonnet) for sonnet in sonnets]

In [22]:
# Build one corpus: all sonnets first, followed by all play lines
combined_data = [*sonnets, *play_dialogues]

In [23]:
# Fit a word-level tokenizer on the combined corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_data)

# Vocabulary size is one more than the number of indexed words
# (presumably to account for index 0, which pad_sequences uses as filler - confirm)
total_words = 1 + len(tokenizer.word_index)

In [24]:
# Build the training n-grams: for each text, every prefix of its token list
# that is at least two tokens long (context tokens + the token to predict)
input_sequences = []
for line in combined_data:
    (token_list,) = tokenizer.texts_to_sequences([line])
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [25]:
# Left-pad every n-gram up to the length of the longest one
max_sequence_len = max(len(sequence) for sequence in input_sequences)
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [59]:
# Split input sequences into input and output
# `input_sequences` is already a rectangular, pre-padded array at this point,
# so it is sliced directly: every column but the last is the model input and
# the last column is the token to predict.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [60]:
# Define the model
# Layer stack: embedding -> bidirectional LSTM (emits the full sequence) ->
# dropout -> LSTM -> ReLU dense -> softmax over the whole vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 100, input_length=max_sequence_len-1),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(100),
    # Dense `units` must be an integer; `total_words / 2` is a float in
    # Python 3, so use floor division to get half the vocabulary size.
    tf.keras.layers.Dense(total_words // 2, activation='relu'),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

In [61]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (labels are integer token ids), and accuracy reported as a metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [63]:
# Fit the model on the (context, next-token) pairs for 100 epochs,
# keeping the returned training history
history = model.fit(
    x=xs,
    y=labels,
    epochs=100,
)

Epoch 1/100
    5/22516 [..............................] - ETA: 35:27:13 - loss: 10.1526 - accuracy: 0.0375

KeyboardInterrupt: 