In [1]:
import tensorflow as tf
import numpy as np

def split_text_file(input_file, output_file1, output_file2, split_ratio=0.001):
    """Split a UTF-8 text file into two files at a line boundary.

    The first ``int(len(lines) * split_ratio)`` lines are written to
    ``output_file1`` and all remaining lines to ``output_file2``. Both output
    files are overwritten if they already exist.

    Args:
        input_file: Path of the text file to read.
        output_file1: Path that receives the leading portion of the lines.
        output_file2: Path that receives the remaining lines.
        split_ratio: Fraction (between 0 and 1 inclusive) of the lines that
            goes to ``output_file1``. Defaults to 0.001, i.e. 0.1%.

    Raises:
        ValueError: If ``split_ratio`` is outside the range [0, 1].
    """
    # Validate before touching any file: a negative ratio would otherwise be
    # interpreted as a negative slice index and silently split from the end.
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    with open(input_file, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Index of the first line that belongs to the second output file
    split_index = int(len(lines) * split_ratio)

    # Write the leading portion to output_file1
    with open(output_file1, "w", encoding="utf-8") as file1:
        file1.writelines(lines[:split_index])

    # Write the remainder to output_file2
    with open(output_file2, "w", encoding="utf-8") as file2:
        file2.writelines(lines[split_index:])

    # Report the ratio that was actually requested instead of a fixed 20%/80%
    first_pct = f"{split_ratio * 100:g}%"
    second_pct = f"{(1 - split_ratio) * 100:g}%"
    print(f"✅ File successfully split into '{output_file1}' ({first_pct}) and '{output_file2}' ({second_pct})")

# Example Usage
# NOTE: split_ratio is left at its default of 0.001, so "output_20.txt" receives
# roughly the first 0.1% of the lines (not 20%) and "output_80.txt" the rest,
# despite what the file names suggest.
split_text_file("shakespeare.txt", "output_20.txt", "output_80.txt")

✅ File successfully split into 'output_20.txt' (20%) and 'output_80.txt' (80%)


In [2]:
# Read the smaller split written by split_text_file and normalise it:
# lowercasing keeps the character vocabulary small and consistent.
with open("output_20.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
text = text.lower()


In [3]:
# Display the lowercased training text to check what was loaded
text

'this is the 100th etext file presented by project gutenberg, and\nis presented in cooperation with world library, inc., from their\nlibrary of the future and shakespeare cdroms.  project gutenberg\noften releases etexts that are not placed in the public domain!!\n\nshakespeare\n\n*this etext has certain copyright implications you should read!*\n\n<<this electronic version of the complete works of william\nshakespeare is copyright 1990-1993 by world library, inc., and is\nprovided by project gutenberg etext of illinois benedictine college\nwith permission.  electronic and machine readable copies may be\ndistributed so long as such copies (1) are for your or others\npersonal use only, and (2) are not distributed or used\ncommercially.  prohibited commercial distribution includes by any\nservice that charges for download time or for membership.>>\n\n*project gutenberg is proud to cooperate with the world library*\nin the presentation of the complete works of william shakespeare\nfor your

In [4]:
# Build the vocabulary: the sorted list of distinct characters that occur in the text.
# (The character <-> index mappings themselves are derived from this list afterwards.)
chars = sorted(set(text))

In [5]:
# Inspect the vocabulary (sorted distinct characters)
chars

['\n',
 ' ',
 '!',
 '"',
 '#',
 '%',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '<',
 '=',
 '>',
 '@',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [6]:
# Map each vocabulary character to its position in the sorted `chars` list
char_to_idx = dict(zip(chars, range(len(chars))))
char_to_idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '%': 5,
 '(': 6,
 ')': 7,
 '*': 8,
 ',': 9,
 '-': 10,
 '.': 11,
 '/': 12,
 '0': 13,
 '1': 14,
 '2': 15,
 '3': 16,
 '4': 17,
 '5': 18,
 '6': 19,
 '7': 20,
 '8': 21,
 '9': 22,
 ':': 23,
 '<': 24,
 '=': 25,
 '>': 26,
 '@': 27,
 '[': 28,
 ']': 29,
 'a': 30,
 'b': 31,
 'c': 32,
 'd': 33,
 'e': 34,
 'f': 35,
 'g': 36,
 'h': 37,
 'i': 38,
 'j': 39,
 'k': 40,
 'l': 41,
 'm': 42,
 'n': 43,
 'o': 44,
 'p': 45,
 'r': 46,
 's': 47,
 't': 48,
 'u': 49,
 'v': 50,
 'w': 51,
 'x': 52,
 'y': 53,
 'z': 54}

In [7]:
# Inverse lookup: position in the sorted `chars` list -> character
idx_to_char = dict(enumerate(chars))
idx_to_char

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: '#',
 5: '%',
 6: '(',
 7: ')',
 8: '*',
 9: ',',
 10: '-',
 11: '.',
 12: '/',
 13: '0',
 14: '1',
 15: '2',
 16: '3',
 17: '4',
 18: '5',
 19: '6',
 20: '7',
 21: '8',
 22: '9',
 23: ':',
 24: '<',
 25: '=',
 26: '>',
 27: '@',
 28: '[',
 29: ']',
 30: 'a',
 31: 'b',
 32: 'c',
 33: 'd',
 34: 'e',
 35: 'f',
 36: 'g',
 37: 'h',
 38: 'i',
 39: 'j',
 40: 'k',
 41: 'l',
 42: 'm',
 43: 'n',
 44: 'o',
 45: 'p',
 46: 'r',
 47: 's',
 48: 't',
 49: 'u',
 50: 'v',
 51: 'w',
 52: 'x',
 53: 'y',
 54: 'z'}

In [8]:
# Number of start positions that leave room for a 100-character window plus the
# character that follows it (100 matches the SEQ_LENGTH used for training below)
len(text) - 100

4718

In [9]:
# The first 100 characters of the text
text[0:100]

'this is the 100th etext file presented by project gutenberg, and\nis presented in cooperation with wo'

In [10]:
# The character at index 100, i.e. the one immediately after text[0:100]
text[100]

'r'

In [11]:
# Look up the integer index assigned to the character "t"
char_to_idx["t"]

48

In [12]:
# Convert text to numbers
def text_to_sequences(text, seq_length):
    """Slice `text` into overlapping windows for next-character prediction.

    Window ``i`` holds the indices of ``text[i:i + seq_length]`` and its target
    is the index of the character that follows, ``text[i + seq_length]``.
    Characters are converted with the notebook-level ``char_to_idx`` mapping.

    Args:
        text: The string to encode.
        seq_length: Number of characters in each input window.

    Returns:
        A tuple ``(inputs, targets)`` of NumPy integer arrays. ``inputs`` has
        shape ``(len(text) - seq_length, seq_length)`` and ``targets`` has
        shape ``(len(text) - seq_length,)``. When the text is too short to
        yield a single window, empty arrays of shape ``(0, seq_length)`` and
        ``(0,)`` are returned so downstream code still sees the expected rank.

    Raises:
        KeyError: If `text` contains a character missing from ``char_to_idx``.
    """
    n_windows = len(text) - seq_length
    if n_windows <= 0:
        # Not enough characters for even one (window, next char) pair
        return (
            np.empty((0, max(seq_length, 0)), dtype=np.int64),
            np.empty((0,), dtype=np.int64),
        )

    # Encode every character once; the windows below are plain slices of this
    # list instead of re-encoding each overlapping window from scratch.
    encoded = [char_to_idx[char] for char in text]

    input_sequences = [encoded[i:i + seq_length] for i in range(n_windows)]
    target_sequences = [encoded[i + seq_length] for i in range(n_windows)]
    return np.array(input_sequences), np.array(target_sequences)

# Window size used for both training and generation
SEQ_LENGTH = 100  # Length of input sequences
# X holds one row of SEQ_LENGTH character indices per window; Y holds the index of
# the character that follows each window.
X, Y = text_to_sequences(text, SEQ_LENGTH)

In [13]:
# Number of input windows and number of targets (one target per window)
len(X),len(Y)

(4718, 4718)

In [14]:
# Vocabulary size: the number of distinct characters, used below as the
# one-hot width (num_classes) and as the size of the model's output layer
vocab_size = len(chars)
vocab_size

55

In [15]:
# One-hot encode the input windows: (samples, SEQ_LENGTH) -> (samples, SEQ_LENGTH, vocab_size).
# The ndim guard makes this cell safe to re-run: X is overwritten in place, and
# encoding an already one-hot X a second time would silently add another axis.
if X.ndim == 2:
    X = tf.keras.utils.to_categorical(X, num_classes=vocab_size)  # One-hot encode input
X

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [16]:
# Sanity check: the number of samples after one-hot encoding
len(X)

4718

In [17]:
# One-hot encode the targets: (samples,) -> (samples, vocab_size), matching the
# softmax output and the categorical cross-entropy loss used for training.
# The ndim guard makes this cell safe to re-run, since Y is overwritten in place.
if Y.ndim == 1:
    Y = tf.keras.utils.to_categorical(Y, num_classes=vocab_size)

### Why stack LSTMs?
Stacking multiple LSTM layers allows the model to learn more complex features and patterns. The first LSTM layer captures low-level features of the sequence, while the second LSTM layer can combine those features into higher-level representations.

🔹 Summary of the Flow:

The first LSTM layer (`return_sequences=True`) processes the input sequence and outputs its hidden state at every time step.

The second LSTM layer consumes that sequence of hidden states and outputs only its final hidden state.

The final hidden state is passed through a Dense layer to produce a vector of probabilities, where each value corresponds to the probability of a specific character in the vocabulary.

In [18]:
# Define the RNN model: two stacked LSTM layers followed by a softmax over the vocabulary.
# The input shape is declared with an explicit Input object instead of an
# `input_shape=` argument on the first LSTM, which recent Keras versions warn about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH, vocab_size)),
    tf.keras.layers.LSTM(256, return_sequences=True),  # hidden state at every time step
    tf.keras.layers.LSTM(256),  # only the final hidden state
    tf.keras.layers.Dense(vocab_size, activation="softmax"),  # one probability per character
])

# Compile model
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
    metrics=["accuracy"],
)

# Train model with validation split (80% training, 20% validation).
# NOTE(review): per the Keras docs, validation_split holds out the *last* fraction
# of the samples before shuffling; with overlapping windows the validation set is
# therefore the tail of the text — confirm that this is the intended evaluation.
EPOCHS = 20
BATCH_SIZE = 64
# Keep the History object so the loss/accuracy curves can be inspected afterwards
history = model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)

  super().__init__(**kwargs)


Epoch 1/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 1s/step - accuracy: 0.1320 - loss: 3.3772 - val_accuracy: 0.1356 - val_loss: 3.3317
Epoch 2/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 1s/step - accuracy: 0.1488 - loss: 3.1407 - val_accuracy: 0.1186 - val_loss: 3.2888
Epoch 3/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 1s/step - accuracy: 0.1812 - loss: 3.0405 - val_accuracy: 0.1716 - val_loss: 3.1574
Epoch 4/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 1s/step - accuracy: 0.2426 - loss: 2.7463 - val_accuracy: 0.2066 - val_loss: 3.0447
Epoch 5/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 1s/step - accuracy: 0.2972 - loss: 2.5092 - val_accuracy: 0.2256 - val_loss: 2.9743
Epoch 6/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - accuracy: 0.3476 - loss: 2.2906 - val_accuracy: 0.2701 - val_loss: 2.8638
Epoch 7/20
[1m59/59[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7df403fa0750>

In [19]:
# Save model
# NOTE(review): the ".h5" suffix selects the legacy HDF5 format; newer Keras releases
# recommend the native ".keras" format — confirm before renaming, since anything that
# loads this file by path would need the same change.
model.save("text_generator_rnn.h5")



In [20]:
# Function to generate text
def generate_text(start_text, length=500, temperature=None):
    """Generate `length` characters that continue `start_text`.

    The seed is lowercased, encoded with ``char_to_idx`` and fitted to exactly
    ``SEQ_LENGTH`` indices: shorter seeds are left-padded with index 0 (the
    newline character in this notebook's vocabulary), longer seeds keep only
    their last ``SEQ_LENGTH`` characters. At each step the window is one-hot
    encoded, passed to ``model.predict``, and the chosen character is appended
    while the window slides forward by one position.

    Args:
        start_text: Seed phrase. Every character must be in the vocabulary.
        length: Number of characters to generate after the seed.
        temperature: ``None`` (default) or a non-positive value selects the
            most probable character at every step (greedy decoding). A
            positive value samples from the predicted distribution after
            rescaling its log-probabilities by ``1 / temperature``; higher
            values give more varied output.

    Returns:
        The lowercased seed followed by the generated characters.

    Raises:
        KeyError: If the seed contains characters missing from ``char_to_idx``.
    """
    start_text = start_text.lower()  # Ensure lowercase consistency

    # Fail with an explicit message instead of a bare KeyError on one character
    unknown_chars = sorted(set(start_text) - set(char_to_idx))
    if unknown_chars:
        raise KeyError(f"Seed text contains characters outside the vocabulary: {unknown_chars}")

    # Encode the seed and fit it to exactly SEQ_LENGTH entries: keep the most
    # recent characters of a long seed, left-pad a short one.
    input_seq = [char_to_idx[char] for char in start_text][-SEQ_LENGTH:]
    input_seq = [0] * (SEQ_LENGTH - len(input_seq)) + input_seq

    pieces = [start_text]  # joined once at the end instead of repeated string +=
    for _ in range(length):
        # One-hot encode the current window and make the batch shape explicit
        input_data = tf.keras.utils.to_categorical([input_seq], num_classes=vocab_size)
        input_data = np.reshape(input_data, (1, SEQ_LENGTH, vocab_size))

        # Predicted distribution over the vocabulary for the next character
        predicted_probs = model.predict(input_data, verbose=0)
        probs = np.asarray(predicted_probs, dtype=np.float64).reshape(-1)

        if temperature is None or temperature <= 0:
            predicted_idx = int(np.argmax(probs))
        else:
            # Temperature-scaled sampling; clip to avoid log(0)
            scaled = np.log(np.clip(probs, 1e-12, None)) / temperature
            weights = np.exp(scaled - scaled.max())
            predicted_idx = int(np.random.choice(len(weights), p=weights / weights.sum()))

        pieces.append(idx_to_char[predicted_idx])

        # Slide the window: drop the oldest index, append the new one
        input_seq = input_seq[1:] + [predicted_idx]

    return "".join(pieces)

# Generate text using a seed phrase: 500 characters continuing "pizza eat"
print(generate_text("pizza eat", 500))

pizza eato tat is at one toned more to les are to "project gutenberg (one page)

we produce about two million dollars for each hour we work.  the
fifty hours is one conservative estimate for how long it we take
to get any etext selected, entered, proofread, edited, copyright
searched and analyzed, the copyright letters written, etc.  this
projected audience is one hundred million readers, which is 10% of the future is a trademark (tm) of world library, inc., and and by charses!!

the goan of project gute


In [21]:
# generate_text is already defined in the previous cell; reuse it here instead of
# re-declaring an identical copy (a later duplicate silently shadows the earlier
# definition and the two can drift apart).

# Generate text using a second seed phrase
print(generate_text("i love you", 500))

i love your donations more than ever!

all donations should be made to "project gutenberg etexts is at
midnight, central time, of the last day of the stated month.  a
preliminary version may often be posted for suggestion, comment
and editing by those who wish to do so bec wess
for your or others
personal use comperate with the world library, inc., and and by charses!!

the goan of project gutenberg (one page)

we produce about two million dollars for each hour we work.  the
fifty hours is one conservativ
