In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os 

In [2]:
### Remove the unncessary characters
text_file = open('text file.txt', 'r', encoding='utf8')
lines = text_file.read()
data_into_list = data = lines.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')
data_into_list

# Remove the unnecessary spaces
data = data_into_list.split()
data = ' '.join(data)
data[:500]

'The Project Gutenberg eBook of Pride and Prejudice, by Jane AustenThis eBook is for the use of anyone anywhere in the United States andmost other parts of the world at no cost and with almost no restrictionswhatsoever. You may copy it, give it away or re-use it under the termsof the Project Gutenberg License included with this eBook or online atwww.gutenberg.org. If you are not located in the United States, youwill have to check the laws of the country where you are located beforeusing this eBoo'

##### Tokenization

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

### saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 223, 186, 912, 3, 328, 4, 1351, 30, 72, 4174, 912, 23, 21, 1]

In [4]:
len(sequence_data)

125076

In [5]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

7200


##### We will use three words to preidct the next word

In [6]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)

print(f'The length of sequneces are : {len(sequences)}')
sequences[:10]

The length of sequneces are : 125073


[[1, 223, 186, 912],
 [223, 186, 912, 3],
 [186, 912, 3, 328],
 [912, 3, 328, 4],
 [3, 328, 4, 1351],
 [328, 4, 1351, 30],
 [4, 1351, 30, 72],
 [1351, 30, 72, 4174],
 [30, 72, 4174, 912],
 [72, 4174, 912, 23]]

##### Separate input and output

In [7]:
x = []
y = []

for i in sequences:
    x.append(i[0:3])
    y.append(i[3])

x = np.array(x)
y = np.array(y)


### Label the output categorically

In [8]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

##### Defining the model

In [9]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             72000     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7200)              7207200   
                                                                 
Total params: 20,328,200
Trainable params: 20,328,200
Non-trainable params: 0
_________________________________________________________________


##### Training the model

In [13]:
from tensorflow.keras.utils import Sequence
import numpy as np   

class DataGenerator(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return batch_x, batch_y

train_gen = DataGenerator(x, y, 64)

In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
model.fit(train_gen, epochs=70, batch_size=64, callbacks=[checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 5.56594, saving model to next_words.h5
Epoch 2/70
Epoch 2: loss improved from 5.56594 to 5.30275, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 5.30275 to 5.10840, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 5.10840 to 4.93795, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 4.93795 to 4.78085, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 4.78085 to 4.62581, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 4.62581 to 4.46958, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 4.46958 to 4.31160, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 4.31160 to 4.14230, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.14230 to 3.96113, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 3.96113 to 3.76859, saving model to next_words.h5
Epoch 12/70
Epoch 12:

Epoch 36: loss improved from 0.63420 to 0.62393, saving model to next_words.h5
Epoch 37/70
Epoch 37: loss improved from 0.62393 to 0.60806, saving model to next_words.h5
Epoch 38/70
Epoch 38: loss improved from 0.60806 to 0.59663, saving model to next_words.h5
Epoch 39/70
Epoch 39: loss improved from 0.59663 to 0.58548, saving model to next_words.h5
Epoch 40/70
Epoch 40: loss improved from 0.58548 to 0.57634, saving model to next_words.h5
Epoch 41/70
Epoch 41: loss improved from 0.57634 to 0.56458, saving model to next_words.h5
Epoch 42/70
Epoch 42: loss improved from 0.56458 to 0.55654, saving model to next_words.h5
Epoch 43/70
Epoch 43: loss improved from 0.55654 to 0.54993, saving model to next_words.h5
Epoch 44/70
Epoch 44: loss improved from 0.54993 to 0.54046, saving model to next_words.h5
Epoch 45/70
Epoch 45: loss improved from 0.54046 to 0.53486, saving model to next_words.h5
Epoch 46/70
Epoch 46: loss improved from 0.53486 to 0.52653, saving model to next_words.h5
Epoch 47/70

<keras.callbacks.History at 0x20caf5f5c60>

##### Predict using model

In [21]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

### Load the model and tokenizer
model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def predict_next_words(model, tokenizer, text):
    
    sequence = tokenizer.texts_to_sequences([text])
    sequence = np.array(sequence)
    preds = np.argmax(model.predict(sequence))
    predicted_word = ""
    
    for key, value in tokenizer.word_index.items():
        if value == preds:
            predicted_word = key
            break
    
    print(predicted_word)
    return(predicted_word)


In [22]:
while(True):
    text = input("Enter your line : ")
    
    if text == "0":
        print("Execution Completed.....")
        break
        
    else :
        try:
            text = text.split(" ")
            text = text[-3:]
            print(text)
            
            predict_next_words(model, tokenizer, text)
            
        except Exception as e:
            print(f"Error occurred : {e}")
            continue

Enter your line : The Project Gutenberg
['The', 'Project', 'Gutenberg']
literary
Enter your line : The Project Gutenberg eBook of
['Gutenberg', 'eBook', 'of']
pride
Enter your line :  Lizzy is not a bit better than
['bit', 'better', 'than']
the


KeyboardInterrupt: Interrupted by user