In [None]:
import numpy as np

## Data Preprocessing

In [None]:
def load_data(path='shakespeare-2.txt'):
  """Read the training corpus from disk and return it as one string.

  Args:
    path: Location of the UTF-8 encoded text file to read. Defaults to the
      corpus file this notebook was written for.

  Returns:
    The entire contents of the file as a single `str`.

  Raises:
    OSError: If the file cannot be opened (for example, it does not exist).
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  with open(path, mode='r', encoding='utf-8') as f:
    return f.read()

In [None]:
data = load_data()
words = data.split()  # whitespace tokenisation; punctuation stays attached to its word

# Vocabulary: every distinct token, sorted so that index assignment is reproducible.
distinct_words = sorted(set(words))

# Lookup tables in both directions. The index -> word table is what text
# generation uses to turn sampled indices back into readable words.
idx_to_word = dict(enumerate(distinct_words))
word_to_idx = {word: i for i, word in idx_to_word.items()}

In [None]:
# Define constants
N_seq = 50  # Number of consecutive words in each input sequence fed to the model
N_words = len(words)  # Total number of tokens in the corpus
N_vocab = len(distinct_words)  # Number of distinct tokens (width of a one-hot vector)
print(N_words, N_vocab)

In [None]:
# Build the supervised pairs with a sliding window (stride 1) over the corpus:
# each input is N_seq consecutive words and its target is the word that
# immediately follows them.
encoded_words = [word_to_idx[word] for word in words]  # look every token up once

window_starts = range(N_words - N_seq)  # empty when the corpus is shorter than N_seq
x_train = [encoded_words[i:i + N_seq] for i in window_starts]
y_train = [encoded_words[i + N_seq] for i in window_starts]

m = len(x_train)  # number of training samples
assert m == len(y_train), "Length mismatch error"

In [None]:
from keras.utils import to_categorical

# One-hot encode the target word of every sample.
y_train = to_categorical(y_train, num_classes=N_vocab)

# One-hot encode each input window, then stack the results into the
# [samples, timesteps, features] layout that the LSTM layers consume.
x_train = np.array(
    [to_categorical(window, num_classes=N_vocab) for window in x_train]
).reshape(m, N_seq, N_vocab)

## Model Training

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

def build_model():
  """Assemble the (uncompiled) next-word prediction network.

  The network is three stacked 512-unit LSTM layers followed by a softmax
  layer. The input shape is read from the global `x_train` and the output
  width from the global `y_train`, so both must exist before calling this.

  Returns:
    An uncompiled keras `Sequential` model.
  """
  sample_shape = x_train[0].shape
  n_outputs = y_train.shape[1]
  return Sequential([
      # The first two LSTMs emit their full output sequence so the next
      # recurrent layer receives one vector per timestep.
      LSTM(512, input_shape=sample_shape, return_sequences=True),
      LSTM(512, return_sequences=True),
      # The last LSTM only passes on its final output.
      LSTM(512),
      Dense(n_outputs, activation='softmax'),
  ])

In [None]:
# Instantiate the network and configure it for training: Adam optimizer with
# categorical cross-entropy, which matches the one-hot encoded targets.
model = build_model()
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
from keras.callbacks import ModelCheckpoint

# Callbacks:
PATH_SAVE = "shakespearean_generator_2.h5"
# save_best_only=True is what makes `monitor`/`mode` take effect: the file is
# only overwritten when the training loss improves. Without it the checkpoint
# is rewritten after every epoch regardless of the monitored value.
checkpoint = ModelCheckpoint(PATH_SAVE, monitor='loss', mode='min', save_best_only=True)
cb_list = [checkpoint]

# Fitting
history = model.fit(x_train, y_train, epochs=30, batch_size=128, callbacks=cb_list)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Generating Text

In [None]:
def generate(seed_words, ohed_seed, N_words):
    """Sample new words from the trained model, starting from a seed window.

    Args:
        seed_words: the seed tokens echoed at the start of the output, given
            either as words (strings) or as vocabulary indices (ints).
        ohed_seed: sequence of exactly N_seq vectors of length N_vocab (the
            one-hot encoded seed) used as the model's first input window.
        N_words: number of new words to generate.

    Returns:
        One string containing the seed words followed by the generated
        words, separated by single spaces.

    Raises:
        KeyError: if a seed word is not in the vocabulary.
        ValueError: if ohed_seed does not hold exactly N_seq vectors.
    """
    if len(ohed_seed) != N_seq:
        raise ValueError(
            f"ohed_seed must contain exactly {N_seq} vectors, got {len(ohed_seed)}"
        )

    window = list(ohed_seed)  # private copy; the caller's sequence is left untouched
    generated_indices = [
        int(token) if isinstance(token, (int, np.integer)) else word_to_idx[token]
        for token in seed_words
    ]

    for _ in range(N_words):
        x = np.array(window).reshape(1, N_seq, N_vocab)
        probabilities = model.predict(x, verbose=0)  # distribution over the next word
        # Rounding in the model's output can leave the distribution summing to
        # slightly more or less than 1, which np.random.choice rejects.
        # Renormalise in float64 before sampling.
        p = np.asarray(probabilities, dtype=np.float64).ravel()
        p /= p.sum()
        idx = int(np.random.choice(N_vocab, p=p))  # sample the next word's index
        generated_indices.append(idx)
        # Slide the window: append the new word's one-hot vector, drop the oldest.
        window.append(to_categorical(idx, num_classes=N_vocab))
        window = window[1:]

    # Convert indices back to words
    return ' '.join(idx_to_word[i] for i in generated_indices)

In [None]:
initial_seed = "your awesome character is very powerful today".lower()
seed_words = initial_seed.split()

# Ensure all words are in the vocabulary. Lookups are exact string matches,
# so a word must appear in the corpus in exactly this form to be accepted.
words_input = set(seed_words)
words_valid = set(word_to_idx.keys())
invalid_words = words_input.difference(words_valid)
if invalid_words:
    # A bad seed is a data problem, not a parsing problem, hence ValueError.
    raise ValueError(f"Input contains invalid words: {invalid_words}")

# Truncate long sequences
if len(seed_words) > N_seq:
    seed_words = seed_words[-N_seq:]  # keep the last N_seq words

# Left-pad short sequences up to N_seq entries. '<PAD>' is only a placeholder:
# it has no entry in word_to_idx unless the corpus happens to contain it, so
# code that encodes seed_words must treat the first N_pad entries specially.
N_pad = max(N_seq - len(seed_words), 0)
seed_words = ['<PAD>'] * N_pad + seed_words

print("The seed words are:", seed_words)


The initial word is :                            YOUR AWESOME CHARACTER:


In [None]:
seed = [word_to_idx[word] for word in seed_words]
ohed_seed = [to_categorical(idx, num_classes=N_vocab) for idx in seed]

In [None]:
generated_sentence = generate(seed, ohed_seed, 500)[N_pad:] # Remove the prepended padding, if any

In [None]:
generated_sentence = ' '.join([idx_to_word[i] for i in generated_sentence])
print(generated_sentence)


In [None]:
# Persist the trained model (as it stands after the last epoch) to an HDF5 file.
model.save('shakespeare_final.h5')