In [1]:
import tensorflow as tf
import numpy as np

def split_text_file(input_file, output_file1, output_file2, split_ratio=0.01):
    """Split a text file by lines into a leading part and the remainder.

    The first ``int(len(lines) * split_ratio)`` lines of ``input_file`` are
    written to ``output_file1``; every remaining line goes to ``output_file2``.
    All files are read and written as UTF-8.

    Args:
        input_file: Path of the text file to read.
        output_file1: Destination path for the leading portion.
        output_file2: Destination path for the remaining lines.
        split_ratio: Fraction of lines, between 0 and 1 inclusive, written to
            ``output_file1``. Defaults to 0.01 (1%).

    Raises:
        ValueError: If ``split_ratio`` is outside the range [0, 1].
    """
    # Validate before touching any file so a bad ratio never truncates outputs.
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    with open(input_file, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Number of leading lines for the first file (fraction truncated toward zero)
    split_index = int(len(lines) * split_ratio)

    # Write the leading portion to output_file1
    with open(output_file1, "w", encoding="utf-8") as file1:
        file1.writelines(lines[:split_index])

    # Write the remaining lines to output_file2
    with open(output_file2, "w", encoding="utf-8") as file2:
        file2.writelines(lines[split_index:])

    # Report the percentages actually requested instead of a fixed 20%/80%
    first_pct = f"{split_ratio * 100:g}%"
    second_pct = f"{(1 - split_ratio) * 100:g}%"
    print(f"✅ File successfully split into '{output_file1}' ({first_pct}) and '{output_file2}' ({second_pct})")

# Example usage: split_ratio is left at its default (0.01), so the first output
# file receives the first 1% of the lines even though the file names say "20"/"80".
split_text_file("shakespeare.txt", "output_20.txt", "output_80.txt")

✅ File successfully split into 'output_20.txt' (20%) and 'output_80.txt' (80%)


In [2]:
# Load dataset (Shakespeare's text as an example)
with open("output_20.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()  # Convert to lowercase for consistency


In [3]:
text

'this is the 100th etext file presented by project gutenberg, and\nis presented in cooperation with world library, inc., from their\nlibrary of the future and shakespeare cdroms.  project gutenberg\noften releases etexts that are not placed in the public domain!!\n\nshakespeare\n\n*this etext has certain copyright implications you should read!*\n\n<<this electronic version of the complete works of william\nshakespeare is copyright 1990-1993 by world library, inc., and is\nprovided by project gutenberg etext of illinois benedictine college\nwith permission.  electronic and machine readable copies may be\ndistributed so long as such copies (1) are for your or others\npersonal use only, and (2) are not distributed or used\ncommercially.  prohibited commercial distribution includes by any\nservice that charges for download time or for membership.>>\n\n*project gutenberg is proud to cooperate with the world library*\nin the presentation of the complete works of william shakespeare\nfor your

In [4]:
# Create character-to-index mapping
chars = sorted(set(text))

In [5]:
chars

['\n',
 ' ',
 '!',
 '"',
 '#',
 '%',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '~']

In [6]:
char_to_idx = {char: idx for idx, char in enumerate(chars)}
char_to_idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '%': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '/': 13,
 '0': 14,
 '1': 15,
 '2': 16,
 '3': 17,
 '4': 18,
 '5': 19,
 '6': 20,
 '7': 21,
 '8': 22,
 '9': 23,
 ':': 24,
 ';': 25,
 '<': 26,
 '=': 27,
 '>': 28,
 '?': 29,
 '@': 30,
 '[': 31,
 ']': 32,
 '_': 33,
 'a': 34,
 'b': 35,
 'c': 36,
 'd': 37,
 'e': 38,
 'f': 39,
 'g': 40,
 'h': 41,
 'i': 42,
 'j': 43,
 'k': 44,
 'l': 45,
 'm': 46,
 'n': 47,
 'o': 48,
 'p': 49,
 'q': 50,
 'r': 51,
 's': 52,
 't': 53,
 'u': 54,
 'v': 55,
 'w': 56,
 'x': 57,
 'y': 58,
 'z': 59,
 '~': 60}

In [7]:
idx_to_char = {idx: char for idx, char in enumerate(chars)}
idx_to_char

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: '#',
 5: '%',
 6: "'",
 7: '(',
 8: ')',
 9: '*',
 10: ',',
 11: '-',
 12: '.',
 13: '/',
 14: '0',
 15: '1',
 16: '2',
 17: '3',
 18: '4',
 19: '5',
 20: '6',
 21: '7',
 22: '8',
 23: '9',
 24: ':',
 25: ';',
 26: '<',
 27: '=',
 28: '>',
 29: '?',
 30: '@',
 31: '[',
 32: ']',
 33: '_',
 34: 'a',
 35: 'b',
 36: 'c',
 37: 'd',
 38: 'e',
 39: 'f',
 40: 'g',
 41: 'h',
 42: 'i',
 43: 'j',
 44: 'k',
 45: 'l',
 46: 'm',
 47: 'n',
 48: 'o',
 49: 'p',
 50: 'q',
 51: 'r',
 52: 's',
 53: 't',
 54: 'u',
 55: 'v',
 56: 'w',
 57: 'x',
 58: 'y',
 59: 'z',
 60: '~'}

In [8]:
len(text) - 100

49689

In [9]:
text[0:100]

'this is the 100th etext file presented by project gutenberg, and\nis presented in cooperation with wo'

In [10]:
text[100]

'r'

In [11]:
char_to_idx["t"]

53

In [12]:
# Convert text to numbers
def text_to_sequences(text, seq_length):
    """Build (input window, next character) training pairs from ``text``.

    For every start position ``i`` in ``range(len(text) - seq_length)`` the
    input is the ``seq_length`` characters starting at ``i`` and the target is
    the character immediately after that window. Characters are converted to
    integer ids with the notebook-level ``char_to_idx`` mapping.

    Args:
        text: Source string; each character must be a key of ``char_to_idx``.
        seq_length: Number of characters in each input window.

    Returns:
        A tuple ``(inputs, targets)`` of NumPy arrays built from the list of
        id windows and the list of next-character ids, respectively. Both are
        empty when ``len(text) <= seq_length``.

    Raises:
        KeyError: If ``text`` contains a character missing from ``char_to_idx``.
    """
    # Encode the text once; the overlapping windows below are slices of this
    # list, so each character is looked up a single time instead of once per window.
    encoded = [char_to_idx[char] for char in text]
    num_samples = len(text) - seq_length

    input_sequences = [encoded[i:i + seq_length] for i in range(num_samples)]
    target_sequences = [encoded[i + seq_length] for i in range(num_samples)]
    return np.array(input_sequences), np.array(target_sequences)

SEQ_LENGTH = 100  # Length of input sequences
X, Y = text_to_sequences(text, SEQ_LENGTH)

In [13]:
len(X),len(Y)

(49689, 49689)

In [14]:
# Vocabulary size: number of distinct characters, used as the one-hot width (num_classes)
vocab_size = len(chars)
vocab_size

61

In [15]:
X = tf.keras.utils.to_categorical(X, num_classes=vocab_size)  # One-hot encode input
X

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [16]:
len(X)

49689

In [17]:
Y = tf.keras.utils.to_categorical(Y, num_classes=vocab_size)

### Why stack LSTMs?
Stacking multiple LSTM layers allows the model to learn more complex features and patterns. The first LSTM layer captures low-level features of the sequence, while the second LSTM layer can combine those features into higher-level representations.

🔹 Summary of the Flow:

First LSTM layer processes the input sequence and outputs the hidden states for each time step.

Second LSTM layer processes the hidden states from the first LSTM and outputs only the final hidden state.

The final hidden state is passed through a Dense layer to produce a vector of probabilities, where each value corresponds to the probability of a specific character in the vocabulary.

In [18]:
# Training hyperparameters shared by every model trained below
EPOCHS = 20
BATCH_SIZE = 64

SEQ_LENGTH = 100  # Sequence length; re-declared with the same value used when X and Y were built
LEARNING_RATE = 0.002

In [19]:
# Define SimpleRNN Model
# The input shape is declared with an explicit Input object instead of an
# `input_shape=` layer argument, which Keras warns against in Sequential models.
simple_rnn_model = tf.keras.Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH, vocab_size)),  # SEQ_LENGTH one-hot vectors per sample
    tf.keras.layers.SimpleRNN(256, return_sequences=True),  # pass every time step to the next recurrent layer
    tf.keras.layers.SimpleRNN(256),  # only the final output feeds the classifier
    tf.keras.layers.Dense(vocab_size, activation="softmax")  # probabilities over the next character
])

# Compile SimpleRNN model
simple_rnn_model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), metrics=["accuracy"])

# Train SimpleRNN Model (the last 20% of the samples are held out for validation)
simple_rnn_model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)


  super().__init__(**kwargs)


Epoch 1/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 28ms/step - accuracy: 0.2030 - loss: 3.0805 - val_accuracy: 0.1830 - val_loss: 3.0289
Epoch 2/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.2062 - loss: 3.0693 - val_accuracy: 0.2310 - val_loss: 2.9559
Epoch 3/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 23ms/step - accuracy: 0.2035 - loss: 3.0571 - val_accuracy: 0.2310 - val_loss: 3.0168
Epoch 4/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.2079 - loss: 3.0592 - val_accuracy: 0.2310 - val_loss: 2.9966
Epoch 5/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.2083 - loss: 3.0687 - val_accuracy: 0.2310 - val_loss: 2.9600
Epoch 6/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.2030 - loss: 3.0640 - val_accuracy: 0.2310 - val_loss: 2.9915
Epoch 7/20
[1m6

<keras.src.callbacks.history.History at 0x7a9d990e7a50>

In [20]:
# Define GRU Model
# The input shape is declared with an explicit Input object instead of an
# `input_shape=` layer argument, which Keras warns against in Sequential models.
gru_model = tf.keras.Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH, vocab_size)),  # SEQ_LENGTH one-hot vectors per sample
    tf.keras.layers.GRU(256, return_sequences=True),  # pass every time step to the next recurrent layer
    tf.keras.layers.GRU(256),  # only the final output feeds the classifier
    tf.keras.layers.Dense(vocab_size, activation="softmax")  # probabilities over the next character
])

# Compile GRU model
gru_model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), metrics=["accuracy"])

# Train GRU Model (the last 20% of the samples are held out for validation)
gru_model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)


Epoch 1/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.3006 - loss: 2.5638 - val_accuracy: 0.4414 - val_loss: 1.8753
Epoch 2/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - accuracy: 0.4531 - loss: 1.8616 - val_accuracy: 0.4853 - val_loss: 1.7466
Epoch 3/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - accuracy: 0.5082 - loss: 1.6362 - val_accuracy: 0.4858 - val_loss: 1.7048
Epoch 4/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 22ms/step - accuracy: 0.5509 - loss: 1.4754 - val_accuracy: 0.5044 - val_loss: 1.6870
Epoch 5/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - accuracy: 0.5934 - loss: 1.3172 - val_accuracy: 0.5060 - val_loss: 1.6592
Epoch 6/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.6257 - loss: 1.1813 - val_accuracy: 0.5087 - val_loss: 1.7129
Epoch 7/20
[1m6

<keras.src.callbacks.history.History at 0x7a9d99c83d90>

In [21]:
# Define Bidirectional LSTM Model
# The input shape is declared with an explicit Input object instead of an
# `input_shape=` layer argument, which Keras warns against in Sequential models.
bilstm_model = tf.keras.Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH, vocab_size)),  # SEQ_LENGTH one-hot vectors per sample
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),  # pass every time step onward
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),  # only the final outputs feed the classifier
    tf.keras.layers.Dense(vocab_size, activation="softmax")  # probabilities over the next character
])

# Compile Bidirectional LSTM model
bilstm_model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), metrics=["accuracy"])

# Train Bidirectional LSTM Model (the last 20% of the samples are held out for validation)
bilstm_model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)


  super().__init__(**kwargs)


Epoch 1/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 58ms/step - accuracy: 0.2611 - loss: 2.7570 - val_accuracy: 0.4189 - val_loss: 1.9590
Epoch 2/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 56ms/step - accuracy: 0.4253 - loss: 1.9663 - val_accuracy: 0.4802 - val_loss: 1.7567
Epoch 3/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 52ms/step - accuracy: 0.4875 - loss: 1.7313 - val_accuracy: 0.4992 - val_loss: 1.6700
Epoch 4/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 53ms/step - accuracy: 0.5348 - loss: 1.5300 - val_accuracy: 0.5087 - val_loss: 1.6341
Epoch 5/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 56ms/step - accuracy: 0.5790 - loss: 1.3741 - val_accuracy: 0.5264 - val_loss: 1.5877
Epoch 6/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 56ms/step - accuracy: 0.6166 - loss: 1.2304 - val_accuracy: 0.5158 - val_loss: 1.6308
Epoch 7/20
[1m6

<keras.src.callbacks.history.History at 0x7a9d946e9010>

In [22]:
# Define LSTM Model
# The input shape is declared with an explicit Input object instead of an
# `input_shape=` layer argument, which Keras warns against in Sequential models.
lstm_model = tf.keras.Sequential([
    tf.keras.Input(shape=(SEQ_LENGTH, vocab_size)),  # SEQ_LENGTH one-hot vectors per sample
    tf.keras.layers.LSTM(256, return_sequences=True),  # pass every time step to the next recurrent layer
    tf.keras.layers.LSTM(256),  # only the final output feeds the classifier
    tf.keras.layers.Dense(vocab_size, activation="softmax")  # probabilities over the next character
])

# Compile LSTM model
lstm_model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), metrics=["accuracy"])

# Train LSTM Model (the last 20% of the samples are held out for validation)
lstm_model.fit(X, Y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2)


Epoch 1/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 27ms/step - accuracy: 0.2389 - loss: 2.8702 - val_accuracy: 0.4081 - val_loss: 2.0480
Epoch 2/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.4090 - loss: 2.0400 - val_accuracy: 0.4548 - val_loss: 1.8365
Epoch 3/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 25ms/step - accuracy: 0.4691 - loss: 1.7865 - val_accuracy: 0.4906 - val_loss: 1.7088
Epoch 4/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.5162 - loss: 1.6181 - val_accuracy: 0.5008 - val_loss: 1.6598
Epoch 5/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.5647 - loss: 1.4489 - val_accuracy: 0.5056 - val_loss: 1.6410
Epoch 6/20
[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 25ms/step - accuracy: 0.5951 - loss: 1.3175 - val_accuracy: 0.5200 - val_loss: 1.6336
Epoch 7/20
[1m6

<keras.src.callbacks.history.History at 0x7a9d946a5fd0>

In [23]:
# Save each trained model to its own HDF5 (.h5) file in the working directory.
# NOTE(review): .h5 is Keras' legacy format and recent Keras versions recommend
# the native ".keras" format — confirm no loader depends on these names before changing.
simple_rnn_model.save("simple_rnn_model.h5")
gru_model.save("gru_model.h5")
bilstm_model.save("bilstm_model.h5")
lstm_model.save("lstm_model.h5")



In [40]:
# Function to generate text
def generate_text(start_text, model, length=500):
    """Generate ``length`` characters that continue ``start_text``.

    Decoding is greedy: at each step the character with the highest predicted
    probability is appended and fed back into the model's input window.

    Args:
        start_text: Seed phrase. It is lower-cased before use, and every
            character must be a key of ``char_to_idx``.
        model: Trained Keras model whose ``predict`` accepts an array of shape
            ``(1, SEQ_LENGTH, vocab_size)`` and returns next-character scores.
        length: Number of characters to generate.

    Returns:
        The lower-cased seed followed by the generated characters.

    Raises:
        KeyError: If the seed contains a character missing from ``char_to_idx``.
    """
    start_text = start_text.lower()  # Ensure lowercase consistency

    # Convert seed text into a sequence of character ids
    input_seq = [char_to_idx[char] for char in start_text]

    # The model needs exactly SEQ_LENGTH steps: keep only the most recent
    # characters of a long seed so the reshape below cannot fail ...
    input_seq = input_seq[-SEQ_LENGTH:]
    # ... and left-pad a short seed. Index 0 is not a dedicated padding token;
    # it is whichever character sorts first in the vocabulary.
    if len(input_seq) < SEQ_LENGTH:
        input_seq = [0] * (SEQ_LENGTH - len(input_seq)) + input_seq

    generated_chars = []
    for _ in range(length):
        # One-hot encode the current window and force the expected batch shape
        input_data = tf.keras.utils.to_categorical([input_seq], num_classes=vocab_size)
        input_data = np.reshape(input_data, (1, SEQ_LENGTH, vocab_size))

        # Predict the next character (greedy argmax over the scores)
        predicted_probs = model.predict(input_data, verbose=0)
        predicted_idx = int(np.argmax(predicted_probs))  # plain int, not a NumPy scalar
        generated_chars.append(idx_to_char[predicted_idx])

        # Slide the window: drop the oldest id and append the new one
        input_seq = input_seq[1:] + [predicted_idx]

    # Join once at the end instead of growing a string inside the loop
    return start_text + "".join(generated_chars)

In [42]:
# Generate 500 characters from every trained model, all starting from the same seed phrase
seed_phrase = 'colour of sky is blue'
labelled_models = [
    ("SIMPLERNN", simple_rnn_model),
    ("GRU", gru_model),
    ("BIDIRRNN", bilstm_model),
    ("LSTM", lstm_model),
]
for label, trained_model in labelled_models:
    # Pass the model object itself (not its name as a string)
    print(f"{label}: {generate_text(seed_phrase, trained_model, 500)}")

SIMPLERNN: colour of sky is blue                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
GRU: colour of sky is blue, and that you distribute electronic texts**


*****

chesected the conditions of the fils.


                                15
  no may i dis berest of the tomb,
  of his suggome converted and as the time
  to the waste happier than thou art.
  gentle thiempare thee, for still the loss,
  thou art thou get a your from love's loving parts,
  and moan th' expense of many a vanished sight.
  thand's form and to be gone,
  who plain the st