Import Required Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

Upload File

In [2]:
# Google Colab only: open the browser file picker so the training corpus can be
# uploaded. The recorded output shows the file being saved under its own name
# (next_word_predictor.txt), which is the path the next cell opens.
from google.colab import files
uploaded=files.upload()

Saving next_word_predictor.txt to next_word_predictor.txt


In [3]:
# Read the uploaded corpus. The context manager closes the file handle even if
# reading fails (the handle was previously left open).
with open("next_word_predictor.txt", 'r', encoding="utf-8") as file:
  lines = file.readlines()

# Join all lines exactly once; each line still ends with its newline here.
data = ' '.join(lines)

# Remove newlines, carriage returns, the BOM and curly quotes. These characters
# are deleted, not replaced by spaces: the single space inserted by the join
# above is what keeps the last word of one line apart from the first of the next.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

Pre-processing


In [4]:
# Collapse every run of whitespace into a single space, then preview the
# first 500 characters of the cleaned corpus.
data = ' '.join(data.split())
data[:500]

'The Project Gutenberg eBook of Narrative of the Life of Frederick Douglass, by Frederick Douglass This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where '

In [5]:
# Build the word -> integer index from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the fitted tokenizer so the prediction cell can reload the same
# vocabulary. The context manager guarantees the pickle is flushed and the
# handle closed before the file is read back later.
with open('token.pkl', 'wb') as token_file:
  pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word indices and preview the start.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 68, 60, 392, 2, 346, 2, 1, 117, 2, 205, 206, 21, 205, 206]

In [6]:
# Number of tokens in the encoded corpus.
len(sequence_data)

44357

In [7]:
# Number of output classes for the model. The +1 accounts for index 0, which
# the Keras Tokenizer reserves and never assigns to a word (word indices
# start at 1), so valid class ids run from 0 to len(word_index) inclusive.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

5612


In [8]:
# Slide a 4-token window over the corpus one position at a time: the first
# three tokens of each window are the context, the fourth is the word to predict.
sequences = [
    sequence_data[start:start + 4]
    for start in range(len(sequence_data) - 3)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  44354


array([[  1,  68,  60, 392],
       [ 68,  60, 392,   2],
       [ 60, 392,   2, 346],
       [392,   2, 346,   2],
       [  2, 346,   2,   1],
       [346,   2,   1, 117],
       [  2,   1, 117,   2],
       [  1, 117,   2, 205],
       [117,   2, 205, 206],
       [  2, 205, 206,  21]])

In [9]:
# Split every 4-token window into model input (first three tokens) and
# target (fourth token).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [10]:
# Sanity check: the first ten 3-token contexts and the token that follows each.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[  1  68  60]
 [ 68  60 392]
 [ 60 392   2]
 [392   2 346]
 [  2 346   2]
 [346   2   1]
 [  2   1 117]
 [  1 117   2]
 [117   2 205]
 [  2 205 206]]
Response:  [392   2 346   2   1 117   2 205 206  21]


In [11]:
# One-hot encode the integer targets into vocab_size columns, the label format
# used with the categorical_crossentropy loss chosen at compile time.
# NOTE(review): this overwrites y, so the cell is not safe to re-run on its own;
# re-run the cell that builds X and y first.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

LSTM Model Building

In [12]:
# Next-word classifier: 10-dimensional embeddings of the 3 context tokens,
# two stacked 1000-unit LSTMs (the first returns the full sequence so the
# second can consume it), a 1000-unit ReLU layer, and a softmax over the
# vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [13]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3, 10)             56120     
_________________________________________________________________
lstm (LSTM)                  (None, 3, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 5612)              5617612   
Total params: 18,722,732
Trainable params: 18,722,732
Non-trainable params: 0
_________________________________________________________________


In [18]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Configure training: cross-entropy over the one-hot targets, Adam optimizer.
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Write next_words.h5 only when the training loss improves on its best value,
# so the file on disk is always the best epoch seen so far.
checkpoint = ModelCheckpoint(
    "next_words.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

model.fit(
    X,
    y,
    epochs=36,
    batch_size=64,
    callbacks=[checkpoint],
)

Epoch 1/36

Epoch 00001: loss improved from inf to 5.26267, saving model to next_words.h5
Epoch 2/36

Epoch 00002: loss improved from 5.26267 to 5.00355, saving model to next_words.h5
Epoch 3/36

Epoch 00003: loss improved from 5.00355 to 4.77862, saving model to next_words.h5
Epoch 4/36

Epoch 00004: loss improved from 4.77862 to 4.55288, saving model to next_words.h5
Epoch 5/36

Epoch 00005: loss improved from 4.55288 to 4.32623, saving model to next_words.h5
Epoch 6/36

Epoch 00006: loss improved from 4.32623 to 4.09941, saving model to next_words.h5
Epoch 7/36

Epoch 00007: loss improved from 4.09941 to 3.86624, saving model to next_words.h5
Epoch 8/36

Epoch 00008: loss improved from 3.86624 to 3.63405, saving model to next_words.h5
Epoch 9/36

Epoch 00009: loss improved from 3.63405 to 3.41106, saving model to next_words.h5
Epoch 10/36

Epoch 00010: loss improved from 3.41106 to 3.17800, saving model to next_words.h5
Epoch 11/36

Epoch 00011: loss improved from 3.17800 to 2.96179

<keras.callbacks.History at 0x7f81163a8750>

In [19]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the best checkpoint and the tokenizer saved earlier in this notebook.
# pickle.load is only safe on trusted files; token.pkl is written by this
# notebook itself. The context manager closes the handle after loading.
model = load_model('next_words.h5')
with open('token.pkl', 'rb') as token_file:
  tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
  """Predict the word most likely to follow ``text``, print it and return it.

  Args:
    model: Object whose ``predict`` accepts a 2-D array holding one row of
      word indices and returns per-class scores.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
      ``word_index`` (word -> integer index).
    text: The context to continue. The interactive cell passes a list of
      up to three words; it is handed to ``texts_to_sequences`` unchanged.

  Returns:
    The predicted word, or an empty string when the predicted class index
    has no entry in ``tokenizer.word_index``.

  Raises:
    ValueError: If ``text`` encodes to an empty sequence, i.e. none of its
      words are known to the tokenizer.
  """
  sequence = np.array(tokenizer.texts_to_sequences([text]))

  # An empty encoding would otherwise be passed to the model as a zero-length
  # input; fail early with a message the caller can show.
  if sequence.size == 0:
    raise ValueError("None of the input words are in the tokenizer vocabulary.")

  preds = np.argmax(model.predict(sequence))

  # Reverse lookup: first word whose index equals the predicted class.
  predicted_word = next(
      (word for word, index in tokenizer.word_index.items() if index == preds),
      "",
  )

  print(predicted_word)
  return predicted_word


Upcoming Word Generation

In [20]:
# Interactive prompt: read a line, predict the next word, repeat until "0".
while True:
  text = input("Enter your line: ")

  # "0" is the sentinel that ends the session.
  if text == "0":
      print("Execution completed.....")
      break

  try:
      # split() with no argument collapses repeated, leading and trailing
      # whitespace, so doubled or trailing spaces no longer produce
      # empty-string tokens that take up slots in the context window.
      # Only the last three words are kept: the model was trained on
      # three-word contexts.
      text = text.split()[-3:]
      print(text)

      Predict_Next_Words(model, tokenizer, text)

  except Exception as e:
      # Best effort: report the problem and prompt again rather than end
      # the session.
      print("Error occurred: ",e)


Enter your line: who have been melted
['have', 'been', 'melted']
to
Enter your line: I shall never
['I', 'shall', 'never']
forget
Enter your line: 0
Execution completed.....
