In [1]:
# Step 1: Preprocess the Data
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import get_file

# Load Shakespeare's Text
url = 'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt'
path_to_file = get_file('shakespeare.txt', url)

# Read the text file
with open(path_to_file, 'r') as file:
    text = file.read()

# Get the list of unique characters in the text
vocab = sorted(set(text))
print(f"Total unique characters: {len(vocab)}")

# Map characters to integers
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}

# Encode the entire text as integers
text_as_int = np.array([char2idx[c] for c in text])

# Set the sequence length for training
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Create sequences of characters and their corresponding labels
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Create input-output pairs
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

# Batch and shuffle the dataset
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Print some data to verify
for input_example, target_example in dataset.take(1):
    print(f"Input Example: {input_example}")
    print(f"Target Example: {target_example}")
#Explanation of Preprocessing:
#We first load the Shakespeare dataset from a URL using get_file().
#Then, we read the file and get all unique characters in the text.
#We map each character to a unique integer (char2idx) and vice versa (idx2char).
#The text is then converted into a sequence of integers, where each integer represents a unique character.
#We create sequences of seq_length characters, where the input is the first seq_length characters, and the target is the next character after that sequence.
#The data is batched and shuffled to prepare for training.

Downloading data from https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt
[1m5458199/5458199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2us/step
Total unique characters: 91
Input Example: [[ 1 81 62 ... 64 69 13]
 [73 26  1 ...  1 63 86]
 [46 75  1 ... 79 76 79]
 ...
 [ 1 73 70 ... 76 82  1]
 [40  7 73 ...  1 72 75]
 [ 1 62 73 ... 82 80 66]]
Target Example: [[81 62 63 ... 69 13  1]
 [26  1 71 ... 63 86 12]
 [75  1 62 ... 76 79  1]
 ...
 [73 70 67 ... 82  1 79]
 [ 7 73 73 ... 72 75 76]
 [62 73 73 ... 80 66  1]]


In [4]:
from tensorflow.keras import layers, models

def build_model(vocab_size, seq_length):
    """
    Builds an LSTM model for text generation.
    
    Args:
        vocab_size (int): Size of the vocabulary (number of unique characters).
        seq_length (int): Length of the input sequences.
    
    Returns:
        model: A compiled LSTM model.
    """
    model = models.Sequential([
        # Embedding layer to convert input integer sequence to dense vector representations
        layers.Embedding(vocab_size, 256, input_length=seq_length),
        
        # LSTM layer to capture long-term dependencies in the sequence
        layers.LSTM(1024, return_sequences=True, stateful=False, recurrent_initializer='glorot_uniform'),
        
        # Dense layer to output the probabilities for each character in the vocabulary
        layers.Dense(vocab_size)
    ])
    
    # Compile the model with the Adam optimizer and Sparse Categorical Crossentropy loss function
    model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    return model

# Let's define the sequence length and vocab size
seq_length = 100  # Define the sequence length for each input sequence
vocab_size = len(vocab)  # Length of unique characters in the Shakespeare dataset

# Build the model
model = build_model(vocab_size, seq_length)

# Display model summary to check architecture
model.summary()




In [8]:
import numpy as np

def prepare_sequences(text, seq_length):
    """
    Converts the raw text into sequences of input-output pairs for training.
    
    Args:
        text (str): The raw Shakespeare text.
        seq_length (int): Length of the input sequences.
    
    Returns:
        x (array): Input sequences (each sequence of `seq_length` characters).
        y (array): Corresponding output characters (next character in sequence).
        char_to_idx (dict): Mapping from character to index.
        idx_to_char (dict): Mapping from index to character.
        vocab_size (int): Size of the vocabulary (number of unique characters).
    """
    chars = sorted(list(set(text)))  # List of unique characters
    vocab_size = len(chars)
    
    char_to_idx = {char: idx for idx, char in enumerate(chars)}  # Map characters to indices
    idx_to_char = {idx: char for idx, char in enumerate(chars)}  # Map indices to characters
    
    text_as_int = [char_to_idx[char] for char in text]  # Convert text to a sequence of indices
    
    # Create sequences of `seq_length` and their corresponding targets (next character)
    x = []
    y = []
    for i in range(0, len(text_as_int) - seq_length, 1):
        seq_in = text_as_int[i:i+seq_length]  # Sequence of input characters
        seq_out = text_as_int[i+seq_length]  # The next character (output)
        x.append(seq_in)
        y.append(seq_out)
    
    return np.array(x), np.array(y), char_to_idx, idx_to_char, vocab_size

# Load your Shakespeare text (you must have it in a variable or file)
with open("shakespeare_raw.txt", "r") as file:
    text = file.read()

# Prepare the sequences
seq_length = 100  # Length of each input sequence
x, y, char_to_idx, idx_to_char, vocab_size = prepare_sequences(text, seq_length)

# Reshape the inputs to be compatible with the LSTM input format (batch_size, seq_length)
x = np.expand_dims(x, axis=-1)  # This is to ensure the correct input shape

# Train/test split (optional, for validation)
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)

# Check the shapes of the prepared data
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")


FileNotFoundError: [Errno 2] No such file or directory: 'shakespeare_raw.txt'

In [None]:
from tensorflow.keras import layers, models

def build_model(vocab_size, seq_length, batch_size):
    """
    Builds the LSTM model for text generation.
    
    Args:
        vocab_size (int): Size of the vocabulary (number of unique characters).
        seq_length (int): Length of input sequences.
        batch_size (int): The batch size used in training.
    
    Returns:
        model: A compiled LSTM model.
    """
    model = models.Sequential([
        layers.Embedding(vocab_size, 256, batch_input_shape=[batch_size, seq_length]),
        layers.LSTM(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        layers.Dense(vocab_size)
    ])
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Build the model
batch_size = 64
model = build_model(vocab_size, seq_length, batch_size)

# Display the model summary
model.summary()


In [9]:
# Train the model
history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_val, y_val))

# Display training history (loss curve)
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])
plt.title('Model Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()


NameError: name 'x_train' is not defined

In [10]:
def generate_text(model, seed, length=500):
    """
    Generates text using the trained model by predicting the next character in the sequence.
    
    Args:
        model: The trained LSTM model.
        seed (str): The seed text to begin the generation.
        length (int): The length of the generated text.
    
    Returns:
        generated_text (str): The generated text.
    """
    generated_text = seed
    input_seq = [char_to_idx[char] for char in seed]
    input_seq = np.expand_dims(input_seq, axis=0)
    
    # Generate characters one by one
    for i in range(length):
        # Predict next character probabilities
        pred = model.predict(input_seq)
        
        # Get the index of the most likely next character
        pred_idx = np.argmax(pred[0, -1, :])
        
        # Convert index to character
        pred_char = idx_to_char[pred_idx]
        
        # Append predicted character to generated text
        generated_text += pred_char
        
        # Update input sequence by shifting it
        input_seq = np.roll(input_seq, shift=-1, axis=-1)
        input_seq[0, -1] = pred_idx
    
    return generated_text

# Generate text from a random seed
seed = "Shall I compare thee to a summer's day?"
generated_text = generate_text(model, seed, length=500)
print(generated_text)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 450ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [15]:
#Experiment with GRU Layer:
#We will now replace the LSTM layer with a GRU (Gated Recurrent Unit) layer to see if it improves performance. GRU is another type of recurrent neural network (RNN) cell, which has a similar purpose to LSTM but with fewer parameters, and in some cases, it can be more efficient.

#GRU Layer Model:

    import tensorflow as tf
from tensorflow.keras import layers, models

def build_gru_model(vocab_size, seq_length, batch_size):
    """
    Builds a GRU model for text generation.

    Args:
        vocab_size (int): Number of unique characters in the text dataset.
        seq_length (int): The length of input sequences.
        batch_size (int): The batch size used during training.

    Returns:
        model: The compiled GRU model.
    """
    model = models.Sequential([
        # Embedding layer to convert input sequences into dense vectors
        layers.Embedding(vocab_size, 256, batch_input_shape=[batch_size, seq_length]),

        # GRU layer to learn long-term dependencies in sequences
        layers.GRU(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),

        # Dense layer to output probabilities for each character
        layers.Dense(vocab_size)
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Example usage
vocab_size = 80  # Size of the vocabulary (adjust this)
seq_length = 100  # Length of the input sequences (adjust as needed)
batch_size = 64  # Batch size (adjust this based on system memory)

# Build and compile the GRU model
gru_model = build_gru_model(vocab_size, seq_length, batch_size)

# Display the model summary
gru_model.summary()


IndentationError: unexpected indent (1744147019.py, line 6)

In [16]:
#input data preprocessing
import numpy as np

def preprocess_text(text, seq_length):
    """
    Preprocesses the input text for training by creating sequences of integers.
    
    Args:
        text (str): The raw text to be processed (e.g., Shakespeare's text).
        seq_length (int): Length of input sequences for training.
        
    Returns:
        x (ndarray): Input sequences of shape (num_sequences, seq_length).
        y (ndarray): Target sequences (next character) of shape (num_sequences,).
    """
    vocab = sorted(set(text))  # Get unique characters from the text
    char_to_int = {char: i for i, char in enumerate(vocab)}  # Map each char to an integer
    int_to_char = {i: char for i, char in enumerate(vocab)}  # Reverse map
    
    # Generate input-output sequences
    sequences = []
    for i in range(0, len(text) - seq_length, seq_length):
        seq_in = text[i:i+seq_length]
        seq_out = text[i+seq_length]
        sequences.append([char_to_int[char] for char in seq_in], char_to_int[seq_out])
    
    x = np.array([seq[0] for seq in sequences])
    y = np.array([seq[1] for seq in sequences])
    
    return x, y, char_to_int, int_to_char, vocab

# Example of how to preprocess text:
text = open('shakespeare.txt', 'r').read()  # Load your dataset here (Shakespeare text file)
seq_length = 100  # Adjust this to your desired sequence length
x_train, y_train, char_to_int, int_to_char, vocab = preprocess_text(text, seq_length)

# Now, reshape x_train for input to the model
x_train = x_train.reshape((x_train.shape[0], seq_length))

# Convert to tensors
x_train = tf.convert_to_tensor(x_train, dtype=tf.int32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)


TypeError: list.append() takes exactly one argument (2 given)

In [17]:
# Train the GRU model
gru_history = gru_model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_val, y_val))

# Plot the training history (optional)
import matplotlib.pyplot as plt

plt.plot(gru_history.history['loss'])
plt.title('GRU Model Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()


NameError: name 'gru_model' is not defined

In [18]:
#Text Generation (Prediction)
def generate_text(model, start_string, char_to_int, int_to_char, seq_length, num_generate=1000):
    """
    Generates text using the trained model starting from a given seed string.

    Args:
        model: The trained LSTM or GRU model.
        start_string: The string to start generating text from.
        char_to_int: Dictionary mapping characters to integers.
        int_to_char: Dictionary mapping integers back to characters.
        seq_length: Length of input sequences for the model.
        num_generate: Number of characters to generate after the start_string.

    Returns:
        generated_text: The generated text starting from the given start_string.
    """
    # Convert start string to integer sequence
    input_eval = [char_to_int[char] for char in start_string]
    input_eval = np.expand_dims(input_eval, 0)  # Convert to batch dimension

    # Generate characters one by one
    generated_text = start_string
    model.reset_states()

    for i in range(num_generate):
        predictions = model.predict(input_eval, verbose=0)
        predicted_id = tf.random.categorical(predictions[0], num_samples=1)[-1, 0].numpy()

        # Append the predicted character to the generated string
        generated_text += int_to_char[predicted_id]

        # Update the input sequence for the next prediction
        input_eval = np.expand_dims([predicted_id], 0)
    
    return generated_text

# Example usage
start_string = "Shall I compare thee to a summer's day?"
generated_text = generate_text(gru_model, start_string, char_to_int, int_to_char, seq_length=100, num_generate=500)
print(generated_text)


NameError: name 'gru_model' is not defined

In [19]:
#Experimenting with LSTM vs GRU
#You can compare GRU with LSTM by building a similar LSTM-based model. Here is the LSTM model construction:

#Building the LSTM Model:
def build_lstm_model(vocab_size, seq_length, batch_size):
    """
    Builds an LSTM model for text generation.

    Args:
        vocab_size (int): Number of unique characters in the text dataset.
        seq_length (int): The length of input sequences.
        batch_size (int): The batch size used during training.

    Returns:
        model: The compiled LSTM model.
    """
    model = models.Sequential([
        # Embedding layer to convert input sequences into dense vectors
        layers.Embedding(vocab_size, 256, batch_input_shape=[batch_size, seq_length]),

        # LSTM layer to learn long-term dependencies in sequences
        layers.LSTM(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),

        # Dense layer to output probabilities for each character
        layers.Dense(vocab_size)
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Example usage
lstm_model = build_lstm_model(vocab_size, seq_length, batch_size)

# Display the model summary
lstm_model.summary()


ValueError: Unrecognized keyword arguments passed to Embedding: {'batch_input_shape': [64, 100]}

In [20]:
# Train the LSTM model
lstm_history = lstm_model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_val, y_val))

# Plot the training history (optional)
plt.plot(lstm_history.history['loss'])
plt.title('LSTM Model Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()


NameError: name 'lstm_model' is not defined

In [21]:
# Example usage with the LSTM model
generated_text_lstm = generate_text(lstm_model, start_string, char_to_int, int_to_char, seq_length=100, num_generate=500)
print(generated_text_lstm)


NameError: name 'lstm_model' is not defined

In [22]:
# Compare training losses (optional)
plt.plot(gru_history.history['loss'], label="GRU Loss")
plt.plot(lstm_history.history['loss'], label="LSTM Loss")
plt.title('Loss Comparison: GRU vs LSTM')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


NameError: name 'plt' is not defined