In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re


In [32]:
import random

# Load the text file
file_path = "shakespeare.txt"  # Change this to your file name
TRAIN_FRACTION = 0.60  # Share of lines written to train.txt; the remainder goes to test.txt
SPLIT_SEED = 42        # Fixed seed so the split is identical after Restart & Run All

# Read all lines (readlines() keeps each line's terminator)
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# The last line of a file may lack a trailing newline. Once shuffled into the
# middle of the list it would be glued to the following line by writelines(),
# so terminate it explicitly before shuffling.
if lines and not lines[-1].endswith("\n"):
    lines[-1] += "\n"

# Shuffle the lines with a dedicated, seeded generator so the result is
# reproducible and the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(lines)

# Calculate the split index (60% training, 40% testing with the default fraction)
split_index = int(TRAIN_FRACTION * len(lines))

# Split the data
train_lines = lines[:split_index]
test_lines = lines[split_index:]

# Save to new files
with open("train.txt", "w", encoding="utf-8") as f:
    f.writelines(train_lines)

with open("test.txt", "w", encoding="utf-8") as f:
    f.writelines(test_lines)

print(f"Split complete! Training set: {len(train_lines)} lines, Test set: {len(test_lines)} lines.")


Split complete! Training set: 74673 lines, Test set: 49783 lines.


In [33]:
# Load the Shakespeare dataset
# NOTE(review): this reads the *test* split, yet the lines loaded here are what
# the model is later fitted on. Confirm whether "train.txt" was intended.
file_path = "test.txt"  # Ensure the file is in the correct directory
with open(file_path, "r", encoding="utf-8") as f:
    shakespeare_text = f.read()

# Preprocess dataset (cleaning).
# Strip surrounding whitespace *before* measuring length, so that indented lines
# with very little actual text are treated as short and removed as well.
stripped_lines = (line.strip() for line in shakespeare_text.split("\n"))
shakespeare_lines = [line for line in stripped_lines if len(line) > 10]  # Remove short lines


In [34]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(shakespeare_lines)
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token (index 0)

# Convert text to sequences.
# A line needs at least two tokens to form an (input, target) pair: with fewer,
# the input would be all padding, or the target itself would be the padding id.
token_sequences = [
    seq for seq in tokenizer.texts_to_sequences(shakespeare_lines) if len(seq) >= 2
]
if not token_sequences:
    raise ValueError("No line contains at least two tokens; there is nothing to train on.")

# Maximum sequence length
max_sequence_length = max(len(seq) for seq in token_sequences)

# Padding sequences: zeros go on the left so the final column is always a real word
sequences = pad_sequences(token_sequences, maxlen=max_sequence_length, padding="pre")

# Split input and target
X_train = sequences[:, :-1]   # All words except last
target_ids = sequences[:, -1]  # Last word (target), as integer ids

# One-hot encode the target output
y_train = tf.keras.utils.to_categorical(target_ids, num_classes=vocab_size)


In [35]:
latent_dim = 256  # Embedding width and number of LSTM units

# Define a simple LSTM model.
# The input shape is declared with an explicit Input instead of the deprecated
# `input_length` argument of Embedding; this also builds the model immediately,
# so model.summary() can report real output shapes and parameter counts.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_length - 1,)),  # every word of a line except the last
    Embedding(vocab_size, latent_dim),
    LSTM(latent_dim, return_sequences=False),          # only the final state feeds the classifier
    Dense(vocab_size, activation="softmax")            # probability distribution over the vocabulary
])

# Compile model (targets are one-hot vectors, hence categorical cross-entropy)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()


In [36]:
# Fit the network on the prepared arrays: 40 passes over the data in
# mini-batches of 128 samples. The returned History is the cell's output.
model.fit(
    x=X_train,
    y=y_train,
    epochs=40,
    batch_size=128,
)


Epoch 1/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.0122 - loss: 8.5159
Epoch 2/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.0291 - loss: 7.4561
Epoch 3/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.0497 - loss: 6.9414
Epoch 4/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.0634 - loss: 6.3949
Epoch 5/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.0813 - loss: 5.9255
Epoch 6/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.1069 - loss: 5.4101
Epoch 7/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.1445 - loss: 4.9272
Epoch 8/40
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.2022 - loss: 4.4599
Epoch 9/40
[1m355/355[0m [3

<keras.src.callbacks.history.History at 0x7abaf5fca750>

In [37]:
def generate_text(seed_text, max_length=50, temperature=1.0):
    """Extend ``seed_text`` by repeatedly sampling the next word from the model.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_length`` defined in earlier cells.

    Args:
        seed_text: Prompt to continue.
        max_length: Maximum number of words to append.
        temperature: Sampling temperature, must be > 0. Values below 1 sharpen
            the predicted distribution, values above 1 flatten it.

    Returns:
        The seed text followed by the generated words, separated by spaces.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    for _ in range(max_length):
        token_ids = tokenizer.texts_to_sequences([seed_text])
        # Left-pad (and left-truncate) to the input length the model was built with.
        model_input = pad_sequences(token_ids, maxlen=max_sequence_length - 1, padding="pre")

        # Work in float64: the rescaling below can underflow in float32.
        probs = np.asarray(model.predict(model_input, verbose=0)[0], dtype=np.float64)

        # Apply temperature sampling. Subtracting the maximum before exponentiating
        # leaves the softmax unchanged but guarantees the largest term is exp(0) = 1,
        # so the normaliser can never be zero, even for very low temperatures.
        scaled_logits = np.log(probs + 1e-8) / temperature
        scaled_logits -= scaled_logits.max()
        weights = np.exp(scaled_logits)
        probs = weights / weights.sum()

        predicted_index = int(np.random.choice(len(probs), p=probs))
        predicted_word = tokenizer.index_word.get(predicted_index, "")

        # An index without a word (e.g. 0, the padding id) ends generation.
        if not predicted_word:
            break

        seed_text += " " + predicted_word

    return seed_text


In [38]:
# Continue a modern, out-of-domain prompt. Words are sampled, so the
# continuation changes from run to run.
# Other seeds to try: "Shall I compare thee", "All the world's a"
modern_sample = generate_text("AI is fun learning")
print(modern_sample)


AI is fun learning by antony yourself for't antony i am alone ' i'll be that in my chamber when they are a doing this cat of a thousand rich thrive he heads with myself to come to him to a mouth claudio who horns 'em a interlude man saw cold go to him


In [39]:
# Continue a second prompt; the sampled continuation changes from run to run.
love_sample = generate_text("love is good or bad")
print(love_sample)

love is good or bad circumstances be forc'd you be what no more devil are but to be angel bail myself both never for your promise is meant my lord is boy dead thou for me forswear me for a blanket a i say no no means will me not me not this boy let
