Import Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np

Importing and Cleaning Dataset

In [2]:
file = open("1661-0.txt", "r", encoding = "utf8")

# store file in list
lines = []
for i in file:
    lines.append(i)

# Convert list to string
data = ""
for i in lines:
  data = ' '. join(lines) 

#replace unnecessary stuff with space
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')  #new line, carriage return, unicode character --> replace by space

#remove unnecessary spaces 
data = data.split()
data = ' '.join(data)
data[:500]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Adventures of Sherlock Holmes Author: Arthur Conan Doyle Release Date: November 29, 2002 [EBook #1661] Last Updated: May 20, 2019 Language: English Character set en"

In [3]:
len(data)

573660

Tokenizing Text

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[142, 4680, 1, 986, 5, 125, 33, 46, 556, 2164, 2165, 27, 987, 14, 22]

In [5]:
len(sequence_data)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

8624


Breaking Sequence into 4grams Such as Trigrams are Fed is Input data for Training

In [6]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]


The Length of sequences are:  108955


array([[ 142, 4680,    1,  986],
       [4680,    1,  986,    5],
       [   1,  986,    5,  125],
       [ 986,    5,  125,   33],
       [   5,  125,   33,   46],
       [ 125,   33,   46,  556],
       [  33,   46,  556, 2164],
       [  46,  556, 2164, 2165],
       [ 556, 2164, 2165,   27],
       [2164, 2165,   27,  987]])

Splitting data into Input and Target Data

In [7]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [8]:

print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 142 4680    1]
 [4680    1  986]
 [   1  986    5]
 [ 986    5  125]
 [   5  125   33]
 [ 125   33   46]
 [  33   46  556]
 [  46  556 2164]
 [ 556 2164 2165]
 [2164 2165   27]]
Response:  [ 986    5  125   33   46  556 2164 2165   27  987]


In [9]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Creting LSTM Model


In [10]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             86240     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 8624)              8632624   
                                                                 
Total params: 21,767,864
Trainable params: 21,767,864
Non-trainable params: 0
_________________________________________________________________


Traing Model Suing Checkpoints #Code Commented Because Model has been trained has saved.

In [12]:
'''
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])
'''

'\nfrom tensorflow.keras.callbacks import ModelCheckpoint\n\ncheckpoint = ModelCheckpoint("next_words.h5", monitor=\'loss\', verbose=1, save_best_only=True)\nmodel.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))\nmodel.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])\n'

Importing Model and Creating Function to get Predictions

In [13]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
  
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
  
  #print(predicted_word)
  return predicted_word

In [14]:
# WHile loop to run model and get predictions Without UI
'''
while(True):
  text = input("Enter your line: ")
  
  if text == "quit":
      print("Execution completed.....")
      break
  
  else:
      try:
          text = text.split(" ")
          text = text[-3:]
          print(text)
        
          Predict_Next_Words(model, tokenizer, text)
          
      except Exception as e:
        print("Error occurred: ",e)
        continue
    '''

'\nwhile(True):\n  text = input("Enter your line: ")\n  \n  if text == "quit":\n      print("Execution completed.....")\n      break\n  \n  else:\n      try:\n          text = text.split(" ")\n          text = text[-3:]\n          print(text)\n        \n          Predict_Next_Words(model, tokenizer, text)\n          \n      except Exception as e:\n        print("Error occurred: ",e)\n        continue\n    '

Creating UI

In [15]:

    
import tkinter as tk

root= tk.Tk()

canvas1 = tk.Canvas(root, width = 400, height = 300,  relief = 'raised')
canvas1.pack()

def callback(sv):
    t=sv.get()
    t = t.split(" ")
    if(len(t)>3):
        getpredictions()



sv = tk.StringVar()

sv.trace("w", lambda name, index, mode, sv=sv: callback(sv))

label1 = tk.Label(root, text='Typing Assistant')
label1.config(font=('helvetica', 14))
canvas1.create_window(200, 25, window=label1)

label2 = tk.Label(root, text='Start Typing:')
label2.config(font=('helvetica', 10))
canvas1.create_window(200, 100, window=label2)

entry1 = tk.Entry(root, textvariable=sv) 
canvas1.create_window(200, 140, window=entry1)
entry1.focus()

label4 = tk.Label(root, text='Predicted Word:')
label4.config(font=('helvetica', 10))
canvas1.create_window(200, 190, window=label4)


def getpredictions():
  
    text = sv.get()
    text = text.split(" ")

    if(len(text)>=3):
        text = text[-3:]
        
        output = Predict_Next_Words(model, tokenizer, text)
        label3 = tk.Label(root, text= str(output),font=('helvetica', 10, 'bold'))
        canvas1.create_window(200, 230, window=label3)
    
        canvas1.create_window(200, 230, window=label3)

    else:
        label3 = tk.Label(root, text="Text Too Short",font=('helvetica', 10, 'bold'))
        canvas1.create_window(200, 230, window=label3)
        canvas1.create_window(200, 230, window=label3)

#Ignore this part
'''
button1 = tk.Button(root,text='Predict Next Word', command=getpredictions, bg='brown', fg='white', font=('helvetica', 9, 'bold'))
canvas1.create_window(200, 180, window=button1)
'''


root.mainloop()

Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\hp\AppData\Local\Programs\Python\Python310\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_15156\4134221737.py", line 18, in <lambda>
    sv.trace("w", lambda name, index, mode, sv=sv: callback(sv))
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_15156\4134221737.py", line 12, in callback
    getpredictions()
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_15156\4134221737.py", line 45, in getpredictions
    output = Predict_Next_Words(model, tokenizer, text)
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_15156\1953996038.py", line 13, in Predict_Next_Words
    preds = np.argmax(model.predict(sequence))
  File "C:\Users\hp\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\hp\AppData\Local\Prog