## Next Word Predictor

In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [3]:
# Open the source text as UTF-8; the next cell iterates over this handle line by line.
file = open('A room with a view.txt', mode='r', encoding="utf8")

In [4]:
# Read every line and close the handle opened in the previous cell.
# Re-running this cell on its own will fail on the closed file: re-run the
# cell that opens `file` first.
with file:
    lines = file.readlines()

# Join the lines a single time; joining inside a per-line loop repeats the
# same full join for every line and is needlessly quadratic.
content = ' '.join(lines)

# Replace line breaks, quotes, the BOM marker, commas and full stops with spaces.
for char in ('\n', '\r', '"', "'", '\ufeff', ',', '.'):
    content = content.replace(char, ' ')

# Collapse every run of whitespace into a single space.
content = " ".join(content.split())
content[:500]

'The Project Gutenberg eBook of A Room With A View by E M Forster This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever You may copy it give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www gutenberg org If you are not located in the United States you will have to check the laws of the country where you are located before using this eBo'

In [5]:
# Length of the cleaned text in characters (content is one space-joined string).
len(content)

382782

In [6]:
# Build the word -> integer index from the whole cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([content])

# Save the fitted tokenizer for the prediction function. The context manager
# makes sure the pickle is fully written and the file handle is closed.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word indices.
seq_data = tokenizer.texts_to_sequences([content])[0]
seq_data[:5]

[1, 113, 105, 594, 5]

In [7]:
# Number of word tokens in the corpus. This is smaller than len(content) because
# len(content) counts characters, whereas seq_data holds one integer per word.
# Repeated words are NOT removed here; each occurrence maps to the same integer.
len(seq_data)

72076

In [8]:
# Number of distinct words known to the tokenizer, plus one.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras Tokenizer
# reserves and never assigns to a word — confirm against the Keras documentation.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

8213


In [9]:
# Slide a 4-token window over the encoded corpus, one step at a time.
sequences = np.array(
    [seq_data[start:start + 4] for start in range(len(seq_data) - 3)]
)
sequences[:5]

array([[  1, 113, 105, 594],
       [113, 105, 594,   5],
       [105, 594,   5,   6],
       [594,   5,   6, 114],
       [  5,   6, 114,  19]])

In [10]:
# Split each window: the first three tokens are the input, the last token is the label.
x = np.array([window[:3] for window in sequences])
y = np.array([window[-1] for window in sequences])

In [11]:
# Convert the integer class labels into a one-hot (binary) class matrix with
# vocab_size columns.
# NOTE(review): the result is a dense float32 matrix of len(y) x vocab_size entries.
# With the ~72k tokens and 8213-word vocabulary printed in earlier cells this is
# on the order of 2 GB — confirm it fits in memory on the target machine.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Creating the model

Define the neural network: an embedding layer, two stacked LSTM layers, and two dense layers ending in a softmax over the vocabulary.

In [21]:
model = Sequential([
    # Map each of the vocab_size word indices to a 10-dimensional vector; inputs are 3 tokens long.
    Embedding(vocab_size, 10, input_length=3),
    # The first LSTM returns its full output sequence so the second LSTM can consume it.
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    # ReLU sets negative activations to 0 and passes positive ones through unchanged.
    Dense(1000, activation="relu"),
    # Softmax scales the vocab_size outputs into probabilities.
    Dense(vocab_size, activation="softmax"),
])

In [22]:
# Print each layer's output shape and parameter count, plus the totals.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 10)             82130     
                                                                 
 lstm_2 (LSTM)               (None, 3, 1000)           4044000   
                                                                 
 lstm_3 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense_2 (Dense)             (None, 1000)              1001000   
                                                                 
 dense_3 (Dense)             (None, 8213)              8221213   
                                                                 
Total params: 21,352,343
Trainable params: 21,352,343
Non-trainable params: 0
_________________________________________________________________


## Plot the model

In [23]:
from tensorflow import keras
# Import plot_model from the public tensorflow.keras API. The previous
# `keras.utils.vis_utils` path is not part of the public API and is not
# available in every Keras version; it also mixed the standalone `keras`
# package with `tensorflow.keras`.
from tensorflow.keras.utils import plot_model

# Write a diagram of the layer graph to plot_image.png.
# Rendering requires both pydot and Graphviz to be installed.
plot_model(model, to_file='plot_image.png', show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [24]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to next_words.h5 whenever the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    "next_words.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)
model.fit(
    x,
    y,
    epochs=70,
    batch_size=64,
    callbacks=[checkpoint],
)

Epoch 1/70
Epoch 1: loss improved from inf to 6.75077, saving model to next_words.h5
Epoch 2/70
Epoch 2: loss improved from 6.75077 to 6.22421, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 6.22421 to 5.81406, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 5.81406 to 5.48839, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 5.48839 to 5.21782, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 5.21782 to 4.97427, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 4.97427 to 4.74115, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 4.74115 to 4.49505, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 4.49505 to 4.23940, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.23940 to 3.96987, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 3.96987 to 3.70272, saving model to next_words.h5
Epoch 12/70
Epoch 12:

Epoch 36/70
Epoch 36: loss improved from 0.48412 to 0.46448, saving model to next_words.h5
Epoch 37/70
Epoch 37: loss improved from 0.46448 to 0.44699, saving model to next_words.h5
Epoch 38/70
Epoch 38: loss improved from 0.44699 to 0.44505, saving model to next_words.h5
Epoch 39/70
Epoch 39: loss improved from 0.44505 to 0.42491, saving model to next_words.h5
Epoch 40/70
Epoch 40: loss improved from 0.42491 to 0.41544, saving model to next_words.h5
Epoch 41/70
Epoch 41: loss improved from 0.41544 to 0.41035, saving model to next_words.h5
Epoch 42/70
Epoch 42: loss improved from 0.41035 to 0.39915, saving model to next_words.h5
Epoch 43/70
Epoch 43: loss improved from 0.39915 to 0.39555, saving model to next_words.h5
Epoch 44/70
Epoch 44: loss improved from 0.39555 to 0.38525, saving model to next_words.h5
Epoch 45/70
Epoch 45: loss improved from 0.38525 to 0.37632, saving model to next_words.h5
Epoch 46/70
Epoch 46: loss improved from 0.37632 to 0.36869, saving model to next_words.h5

<keras.callbacks.History at 0x265941b21f0>

## Prediction System

In [30]:
from tensorflow.keras.models import load_model

# Load the model and tokenizer that were saved earlier.
model = load_model("next_words.h5")

# NOTE: pickle.load can execute arbitrary code. Only load a token.pkl that you
# created yourself. The context manager closes the file handle after reading.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def predict_next_word(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict`` method takes an array of index
            sequences and returns per-word scores.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (and optionally ``index_word``).
        text: The context, handed to ``tokenizer.texts_to_sequences`` as a
            single item.

    Returns:
        The predicted word, or ``""`` when the highest-scoring index is not
        assigned to any word.

    Raises:
        ValueError: If no word of ``text`` is known to the tokenizer, so
            there is nothing to feed to the model.
    """
    sequence_data = tokenizer.texts_to_sequences([text])

    # Unknown words are dropped by the tokenizer; fail early with a clear
    # message instead of passing an empty array to the model.
    if len(sequence_data) == 0 or len(sequence_data[0]) == 0:
        raise ValueError("none of the input words are in the vocabulary")

    sequence_data = np.array(sequence_data)
    prediction = int(np.argmax(model.predict(sequence_data)))  # index of the highest score

    # Direct index -> word lookup instead of scanning the whole vocabulary.
    index_to_word = getattr(tokenizer, "index_word", None)
    if index_to_word is None:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    predicted_word = index_to_word.get(prediction, "")

    print(predicted_word)
    return predicted_word

In [31]:
# Interactive prompt: predict the next word until the user enters 0.
while True:
    text = input("Enter text: ")

    # Tolerate stray whitespace around the exit sentinel.
    if text.strip() == '0':
        print("Execution completed!")
        break

    try:
        # split() without an argument ignores repeated, leading and trailing
        # spaces, so empty strings cannot take up slots in the 3-word context.
        context = text.split()[-3:]
        print(context)
        predict_next_word(model, tokenizer, context)
    except Exception as e:
        # Report the problem and keep prompting rather than ending the session.
        print("Error occurred", e)
            

Enter text: License included with
['License', 'included', 'with']
this
Enter text: Laureate that hung behind the
['hung', 'behind', 'the']
english
Enter text: 0
Execution completed!
