In [6]:
import tensorflow as tf
import numpy as np

def split_text_file(input_file, output_file1, output_file2, split_ratio=0.01):
    with open(input_file, "r", encoding="utf-8") as file:
        lines = file.readlines()

    split_index = int(len(lines) * split_ratio)  # Compute 20% split index

    # Write the first 20% to output_file1
    with open(output_file1, "w", encoding="utf-8") as file1:
        file1.writelines(lines[:split_index])

    # Write the remaining 80% to output_file2
    with open(output_file2, "w", encoding="utf-8") as file2:
        file2.writelines(lines[split_index:])

    print(f"✅ File successfully split into '{output_file1}' (5%) and '{output_file2}' (95%)")

# Example Usage
split_text_file("shakespeare.txt", "output_05.txt", "output_95.txt")


# Load dataset (Shakespeare's text as an example)
with open("output_05.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()  # Convert to lowercase for consistency

# Create character-to-index mapping
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

# Convert text to numbers
def text_to_sequences(text, seq_length):
    input_sequences = []
    target_sequences = []
    for i in range(len(text) - seq_length):
        input_sequences.append([char_to_idx[char] for char in text[i:i+seq_length]])
        target_sequences.append(char_to_idx[text[i+seq_length]])
    return np.array(input_sequences), np.array(target_sequences)

SEQ_LENGTH = 100  # Length of input sequences
X, Y = text_to_sequences(text, SEQ_LENGTH)

# Normalize data (convert to float and scale)
vocab_size = len(chars)
X = tf.keras.utils.to_categorical(X, num_classes=vocab_size)  # One-hot encode input
Y = tf.keras.utils.to_categorical(Y, num_classes=vocab_size)

# Define the RNN Model with LSTM
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(256, input_shape=(SEQ_LENGTH, vocab_size), return_sequences=True),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dense(vocab_size, activation="softmax")
])

# Compile model
model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=0.002), metrics=["accuracy"])

# Train model
EPOCHS = 20
BATCH_SIZE = 64
#model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS)
# Train model with validation split (80% training, 20% validation)
model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)

# Save model
model.save("text_generator_rnn.h5")

✅ File successfully split into 'output_05.txt' (5%) and 'output_95.txt' (95%)


  super().__init__(**kwargs)


Epoch 1/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - accuracy: 0.2306 - loss: 2.9019 - val_accuracy: 0.3860 - val_loss: 2.0950
Epoch 2/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.3866 - loss: 2.1102 - val_accuracy: 0.4489 - val_loss: 1.8577
Epoch 3/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.4557 - loss: 1.8372 - val_accuracy: 0.4873 - val_loss: 1.7376
Epoch 4/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.5079 - loss: 1.6705 - val_accuracy: 0.4893 - val_loss: 1.6814
Epoch 5/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.5498 - loss: 1.4935 - val_accuracy: 0.5133 - val_loss: 1.6391
Epoch 6/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.5910 - loss: 1.3414 - val_accuracy: 0.5095 - val_loss: 1.6458
Epoch 7/20
[1m6



In [9]:
# Function to generate text
def generate_text(start_text, length=500):
    start_text = start_text.lower()  # Ensure lowercase consistency
    generated_text = start_text

    # Convert seed text into a sequence
    input_seq = [char_to_idx[char] for char in start_text]

    # Pad input sequence to SEQ_LENGTH
    if len(input_seq) < SEQ_LENGTH:
        input_seq = [0] * (SEQ_LENGTH - len(input_seq)) + input_seq  # Left-padding

    for _ in range(length):
        # Prepare input sequence
        input_data = tf.keras.utils.to_categorical([input_seq], num_classes=vocab_size)

        # Ensure correct shape
        input_data = np.reshape(input_data, (1, SEQ_LENGTH, vocab_size))

        # Predict next character
        predicted_probs = model.predict(input_data, verbose=0)
        predicted_idx = np.argmax(predicted_probs)

        # Append character to generated text
        next_char = idx_to_char[predicted_idx]
        generated_text += next_char

        # Update input sequence
        input_seq.append(predicted_idx)
        input_seq = input_seq[1:]  # Keep sequence length constant

    return generated_text

# Generate text using a seed phrase
print(generate_text("pizza eat", 500))


pizza eather new least of the fierce to the extext or any of the future is subject touch complete,
inaccurs, and earthed berit of recdies,
  and loathsome canker lives in sweete to grace,
  and summer's lease hath despite twine reclive,
    theirs for their style i'll read, and namy prousulled:
  then i ab presagere's distillation left
  and sable the orient when the gracious light
  lifts up his burning his grace, in wanting wast,
  and do whate'er thou wilt swift-footed time
  to the wide world and all
