In [2]:
#download the book text and save it to book.txt
!wget https://www.gutenberg.org/files/1661/1661-0.txt -O book.txt

--2024-11-16 14:30:09--  https://www.gutenberg.org/files/1661/1661-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607504 (593K) [text/plain]
Saving to: ‘book.txt’


2024-11-16 14:30:10 (1.93 MB/s) - ‘book.txt’ saved [607504/607504]



In [3]:
#read the whole file into a single lower-cased text string
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open until the object happens to be garbage collected.
with open('book.txt', 'r', encoding='utf-8') as book_file:
    text = book_file.read().lower()

In [4]:
#create the list of text units used for training
#the text is split on newline characters, so each element is one line of the
#file (not a grammatical sentence) and blank lines become empty strings
sentences = text.split('\n')

In [6]:
#import dependencies for preprocessing the text data and building sequences
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
#initialize the tokenizer; words that are not in the vocabulary map to the '<UNK>' token
#NOTE(review): char_level is not passed, so this presumably tokenizes word by word
#(the word-like generated output at the end of the notebook is consistent with that) - confirm
#NOTE(review): the generated text contains tokens with curly quotes (e.g. “g”), which
#suggests the default `filters` do not strip “ ” - consider extending `filters`
tokenizer = Tokenizer(oov_token='<UNK>')

In [8]:
#fit the tokenizer on the lines of the book; this populates tokenizer.word_index
tokenizer.fit_on_texts(sentences)

In [9]:
#vocabulary size = number of indexed words + 1
#the extra slot accounts for index 0, which the padded sequences use as the padding value
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8923

In [11]:
#encode every line as a list of word indices
sequences = tokenizer.texts_to_sequences(sentences)

#every prefix of length >= 2 of an encoded line becomes one training example;
#lines with fewer than two tokens contribute nothing
input_sequences = [
  encoded_line[:prefix_end]
  for encoded_line in sequences
  for prefix_end in range(2, len(encoded_line) + 1)
]

In [12]:
#show the first four n-gram prefixes, separated by spaces on one line
print(*(input_sequences[position] for position in range(4)))

[4776, 158] [4776, 158, 331] [4776, 158, 331, 886] [4776, 158, 331, 886, 5]


In [13]:
#find the maximum length among sequences
max_seq_len = max([len(seq) for seq in input_sequences])
max_seq_len

20

In [14]:
#we will keep the last value of the sequence as our target label and all values before that as input to sequence model

In [15]:
#pad the sequences to ensure that they are all of same length
#pad every sequence to max_seq_len so that they all have the same length
#(zeros are added at the front, as the printed examples show)
padded_sequences = pad_sequences(input_sequences, maxlen = max_seq_len)

In [16]:
#inspect the first two padded rows
print(padded_sequences[0], padded_sequences[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 4776  158] [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 4776  158  331]


In [17]:
import numpy as np
#make sure the padded data is a NumPy array so it can be sliced column-wise
# np.asarray returns its argument unchanged when it already is an ndarray
# (which is what pad_sequences produces), so no second full copy of the data is made.
padded_sequences = np.asarray(padded_sequences)

In [18]:
#confirm that the first two rows share the same padded length
for row_index in (0, 1):
  print(len(padded_sequences[row_index]))

20
20


In [19]:
#prepare training sequences and labels
x = padded_sequences[:, : -1]
labels = padded_sequences[:, -1]

In [20]:
#one integer label (the last token of the row) per training example
labels.shape

(101523,)

In [21]:
#to one hot encode the labels
#one hot encode the labels: one row per example, one column per vocabulary index
#NOTE(review): y is a dense (num_examples, vocab_size) array - (101523, 8923) in the
#recorded run, i.e. roughly 9e8 elements - which is memory-hungry. Keeping the integer
#`labels` and compiling the model with loss='sparse_categorical_crossentropy' would
#avoid it, but this cell and the compile cell would have to be changed together.
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [22]:
#(number of examples, max_seq_len - 1): every column of the padded rows except the last
x.shape

(101523, 19)

In [23]:
#x = x.reshape(x.shape[0], x.shape[1], 1)

In [24]:
#(number of examples, vocab_size) after one hot encoding
y.shape

(101523, 8923)

In [25]:
#import dependencies for defining the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [26]:
#define and compile the model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_seq_len-1))
model.add(Bidirectional(LSTM(256)))
model.add(Dense(vocab_size, activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
model.summary()



In [27]:
#initialize the callback that stops training early when the training accuracy ('acc')
#does not improve by at least 0.01 (one percentage point)
from tensorflow.keras.callbacks import EarlyStopping
# mode='max' states explicitly that a higher 'acc' is better, instead of relying on
# Keras inferring the direction from the metric name.
# NOTE(review): patience is left at its default, so training presumably stops at the
# first epoch without sufficient improvement - confirm this is intended.
es = EarlyStopping(monitor = 'acc', mode='max', min_delta=0.01)

In [28]:
#train for up to 50 epochs in batches of 512; the `es` callback may end training earlier
#NOTE(review): no validation data is passed, so only training metrics are reported
model.fit(x, y, epochs=50, verbose=1, batch_size=512, callbacks=[es])

Epoch 1/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - acc: 0.0662 - loss: 6.6566
Epoch 2/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 42ms/step - acc: 0.1447 - loss: 5.2912
Epoch 3/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 41ms/step - acc: 0.1804 - loss: 4.6561
Epoch 4/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 42ms/step - acc: 0.2159 - loss: 4.0818
Epoch 5/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - acc: 0.2700 - loss: 3.5485
Epoch 6/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - acc: 0.3298 - loss: 3.0951
Epoch 7/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 42ms/step - acc: 0.3817 - loss: 2.7459
Epoch 8/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - acc: 0.4369 - loss: 2.4485
Epoch 9/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x7b1ff841efe0>

In [31]:
#Time to become storyteller!
import numpy as np

seed_text = "I could not help laughing at the ease with which he explained his process of deduction"
next_words = 100

# Greedy generation: at every step the growing text is re-encoded, padded to the
# model's input length, and the single most probable next word is appended.
for _ in range(next_words):
  sequence = tokenizer.texts_to_sequences([seed_text])
  padded = pad_sequences(sequence, maxlen=max_seq_len-1)
  predicted_probs = model.predict(padded, verbose=0) # Get predicted probabilities
  # The batch holds one sequence, so take element 0 and convert it to a plain int.
  predicted = int(np.argmax(predicted_probs, axis=-1)[0])
  # index_word is the tokenizer's index -> word mapping; a direct lookup replaces a
  # linear scan over word_index on every step. An index with no word (e.g. the
  # padding index 0) yields an empty string, as before.
  output_word = tokenizer.index_word.get(predicted, '')
  seed_text += ' ' + output_word
print(seed_text)

I could not help laughing at the ease with which he explained his process of deduction when he saw her last home after “g” and made up my position upon “g” i “g” a “g” and then i read that i had just taken down “g” when “g” as “g” as i could “g” catch the honour of “g” in favour of the “g” and i could “g” into “g” mind to my position in the handling of large “g” which i “g” at the houses “g” only three “g” upon the “g” and a “g” ready to the “g” ” and the prisoner “g” with a chair beside the light holding it the passage “g”
