In [1]:
#download the data and save it to poem.txt
!wget https://www.gutenberg.org/files/1661/1661-0.txt -O book.txt

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
#import wget
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from keras.models import load_model

In [4]:
wget.download('https://www.gutenberg.org/files/1661/1661-0.txt', 'book.txt')

100% [............................................................................] 607430 / 607430

'book.txt'

In [3]:
text = open('book.txt','r', encoding = 'utf-8').read()
text = text.lower()


In [4]:
sentences = text.split('\n')

In [5]:
tokenizer = Tokenizer(oov_token = '<UNK>')

In [6]:
#fit on texts
tokenizer.fit_on_texts(sentences)

In [7]:
#calculate vocab size
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8915

In [8]:
#convert texts into sewuences
sequences = tokenizer.texts_to_sequences(sentences)


In [9]:
#append all input sequences in form of x, x+1, x+2.........
input_sequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        n_gram_sequence = sequence[:i+1]
        input_sequences.append(n_gram_sequence)
print(input_sequences[0], input_sequences[1], input_sequences[2])

[4775, 145] [4775, 145, 132] [4775, 145, 132, 886]


In [10]:
#find the maximum length among sequences
max_seq_len = max([len(seq) for seq in input_sequences])
max_seq_len

20

In [11]:
#pad sequences
padded_sequences = pad_sequences(input_sequences, maxlen = max_seq_len)


In [12]:
print(padded_sequences[0])
print(padded_sequences[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 4775  145]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 4775  145  132]


In [12]:
padded_sequences = np.array(padded_sequences)

In [13]:
x = padded_sequences[:, :-1]    #X will be upto n-1
labels = padded_sequences[:, -1]      #Y will be nth

In [14]:
#convert it into one hot encoding
y = tf.keras.utils.to_categorical(labels, num_classes = vocab_size)

In [16]:
#Y shape will be batch size x vocab size
y.shape

(101575, 8915)

In [17]:
#X shape will be batch size x max seq len
x.shape

(101575, 19)

In [18]:
model = keras.Sequential([
    
    layers.Embedding(input_dim = vocab_size, output_dim = 100, input_length = max_seq_len - 1),
    
    layers.Bidirectional(layer = layers.LSTM(units = 256)),
    
    layers.Dense(units = vocab_size, activation = 'softmax'),
     
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 100)           891500    
                                                                 
 bidirectional (Bidirectiona  (None, 512)              731136    
 l)                                                              
                                                                 
 dense (Dense)               (None, 8915)              4573395   
                                                                 
Total params: 6,196,031
Trainable params: 6,196,031
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [20]:
cb = EarlyStopping(monitor = 'accuracy', min_delta = 0.01, verbose = 1, patience = 2)

In [21]:
model.fit(x, y, epochs = 50, batch_size = 512, callbacks = [cb])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
 25/199 [==>...........................] - ETA: 3:46 - loss: 1.8985 - accuracy: 0.5672

KeyboardInterrupt: 

In [22]:
model.save('text_generation.h5')

In [15]:
loaded_model = load_model('text_generation.h5')

In [35]:
#Time to become storyteller!
seed_text = "I could not help laughing at the ease with which he explained his process of deduction"          
next_words = 100
  
for i in range(next_words):
    
    sequence = tokenizer.texts_to_sequences([seed_text])
    padded = pad_sequences(sequence, maxlen=max_seq_len-1)
    predicted = loaded_model.predict(padded, verbose=0)
    predicted = np.argmax(predicted)
    output_word = ''
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += ' ' + output_word
print(seed_text)

I could not help laughing at the ease with which he explained his process of deduction and i knew that he was in my room and in his room all was in a pitiable that during the man’s face as i am not convinced that i had no doubt that i am sorry that i should have been a jump in the matter ” said he “it makes a very sweet and shape of a man who is popular with a very obstinate man ” he said as he spoke and the old man entered in a low hill and no one could a man of the spring end in the southampton the table that the


In [28]:
sequence = tokenizer.texts_to_sequences([seed_text])
sequence

[[7, 60, 26, 259, 663, 22, 2, 3442, 19, 23, 12, 1215, 15, 1946, 5, 1695]]

In [29]:
padded = pad_sequences(sequence, maxlen=max_seq_len-1)
padded

array([[   0,    0,    0,    7,   60,   26,  259,  663,   22,    2, 3442,
          19,   23,   12, 1215,   15, 1946,    5, 1695]])

In [30]:
predicted = loaded_model.predict(padded, verbose=0)
predicted

array([[1.8601180e-10, 1.5920117e-10, 5.9361255e-04, ..., 1.3311574e-14,
        2.5395330e-10, 1.5523601e-12]], dtype=float32)

In [31]:
predicted = np.argmax(predicted)
predicted

3

In [32]:
for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word           
output_word

'and'

In [33]:
seed_text += ' ' + output_word
seed_text

'I could not help laughing at the ease with which he explained his process of deduction                                                                                                                                                                                                         and'