Token Dictionary Creation

In [1]:
def build_dict():
  """Download the training corpus, tokenize it and save the word -> id table.

  The corpus is split on whitespace, stripped of punctuation with the
  module-level `table`, and reduced to lower-cased, purely alphabetic words.
  Every run of `seq_length + 1` consecutive words becomes one training line
  (`seq_length` context words followed by the word to predict).

  Side effects:
    * writes the table to 'TD.csv';
    * binds the fitted tokenizer and the encoded training lines to the
      module-level names `tokenizer` and `sequences`, because `built_model`
      reads those two names when it prepares the training set.

  Relies on `table`, `seq_length` and `pd` existing at module level by the
  time the function is called.

  Returns:
    pandas.DataFrame indexed by word, with a single 'value' column holding
    the tokenizer's integer id for that word.

  Raises:
    requests.HTTPError: the corpus download returned an error status.
    requests.Timeout: the server did not answer within the timeout.
  """
  # Published at module level; previously these were locals and were lost
  # as soon as the function returned.
  global tokenizer, sequences

  import requests
  from tensorflow.keras.preprocessing.text import Tokenizer

  # Corpus -> The Project Gutenberg EBook of The Art Of Writing & Speaking The English Language, by Sherwin Cody
  response = requests.get('http://www.gutenberg.org/files/19719/19719-0.txt',
                          timeout=60)
  # Fail loudly instead of building a vocabulary out of an HTTP error page.
  response.raise_for_status()

  # Cleaning the text and separating each word into a list.
  # str.split() with no argument already splits on newlines as well as spaces.
  stripped = (w.translate(table) for w in response.text.split())
  tokens = [w.lower() for w in stripped if w.isalpha()]

  # Length of training sequence is one greater than the input sequence
  length = seq_length + 1
  lines = [' '.join(tokens[end - length:end])
           for end in range(length, len(tokens) + 1)]

  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  sequences = tokenizer.texts_to_sequences(lines)

  word_index = tokenizer.word_index
  token_dict = pd.DataFrame(data=list(word_index.values()),
                            index=list(word_index.keys()),
                            columns=['value'])
  token_dict.to_csv('TD.csv')
  return token_dict

Model Creation

In [2]:
def built_model(sequences=None, tokenizer=None):
  """Train the next-word LSTM on the encoded corpus and save it to 'ATG.h5'.

  Args:
    sequences: encoded training lines; every row is split into its leading
      ids (model input) and its last id (prediction target). Defaults to the
      module-level `sequences`.
    tokenizer: fitted tokenizer whose `word_index` defines the vocabulary.
      Defaults to the module-level `tokenizer`.

  Relies on `np` and `seq_length` existing at module level.

  Returns:
    The trained Keras model (also written to 'ATG.h5').

  Raises:
    RuntimeError: no training data was passed in and none is available at
      module level.
  """
  # The original body assigned to `sequences` while also expecting to read a
  # global of the same name, which always raised UnboundLocalError. Taking
  # the inputs as (optional) parameters removes that shadowing.
  module_state = globals()
  if sequences is None:
    sequences = module_state.get('sequences')
  if tokenizer is None:
    tokenizer = module_state.get('tokenizer')
  if sequences is None or tokenizer is None:
    raise RuntimeError(
        'built_model() needs the encoded training sequences and the fitted '
        'tokenizer: pass them as arguments or define `sequences` and '
        '`tokenizer` at module level first.')

  from tensorflow.keras.utils import to_categorical
  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, LSTM, Embedding

  # Preparation of dataset for training
  training = np.array(sequences)
  X, y = training[:, :-1], training[:, -1]
  # +1 because id 0 is not in word_index but still needs an output class.
  vocab_size = len(tokenizer.word_index) + 1
  y = to_categorical(y, num_classes=vocab_size)

  # Building the model
  model = Sequential()
  model.add(Embedding(vocab_size, 50, input_length=seq_length))
  model.add(LSTM(100, return_sequences=True))
  model.add(LSTM(100))
  model.add(Dense(100, activation='relu'))
  model.add(Dense(vocab_size, activation='softmax'))
  # `metrics` is documented as a list; a bare string is rejected by newer
  # Keras releases.
  model.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])

  # Training and saving the model
  model.fit(X, y, batch_size=256, epochs=200)
  model.save('ATG.h5')
  return model

Loading Token Dictionary and Model

In [3]:
import numpy as np
import pandas as pd
import string
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tkinter import *

# Translation table that deletes every ASCII punctuation character; used with
# str.translate() wherever text is cleaned.
table = str.maketrans(dict.fromkeys(string.punctuation))

# Empty placeholder for the latest prediction vector; prediction() replaces it.
matrix = np.empty(0)

In [4]:
# Number of context words the model is given as input.
seq_length=3

# Load the trained model; if 'ATG.h5' cannot be opened, rebuild the
# dictionary and train a new model.
try:
  model = load_model('ATG.h5')
except OSError:
  token_dict = build_dict()
  model = built_model()

# Load the word -> id table saved next to the model, rebuilding it when the
# file is missing.
try:
  # keep_default_na=False: by default read_csv turns strings such as "null",
  # "nan" or "na" into NaN, which would corrupt those vocabulary words in the
  # index (and break the string operations later applied to the index).
  token_dict = pd.read_csv('TD.csv',index_col=0,keep_default_na=False)
except FileNotFoundError:
  token_dict = build_dict()

Prediction Function

In [5]:
def prediction(data):
  """Run the model on the text in `data` and store the result in `matrix`.

  The text is split on whitespace, punctuation is removed with `table`, and
  only purely alphabetic words are kept (lower-cased). Words missing from
  `token_dict` are skipped. The remaining ids are padded/truncated to
  `seq_length` with `pad_sequences` and the first row of `model.predict` is
  written to the module-level `matrix`. Nothing is returned.
  """
  global matrix

  cleaned = (piece.translate(table) for piece in data.split())
  words = [piece.lower() for piece in cleaned if piece.isalpha()]

  token_ids = []
  for word in words:
    try:
      token_id = token_dict.value[word]
    except KeyError:
      # Word is not in the dictionary: leave it out of the model input.
      continue
    token_ids.append(token_id)

  padded = pad_sequences([token_ids], maxlen=seq_length)
  matrix = model.predict(padded)[0]
  
def next_word(input_text, max_words=5, min_prob=0.05):
    """Suggest the most probable words to follow `input_text`.

    Calls prediction(input_text), which refreshes the module-level `matrix`,
    then picks words from it in decreasing order of probability.

    Args:
      input_text: the text typed so far.
      max_words: maximum number of suggestions to return (default 5).
      min_prob: stop as soon as the best remaining probability is below this
        value (default 0.05).

    Returns:
      A list of up to `max_words` words, most probable first; possibly empty.
    """
    prediction(input_text)

    # Work on a copy: `matrix` itself is read again later by curr_word().
    scores = matrix.copy()
    if scores.size == 0:
        return []

    # Output class k maps to row k-1 of token_dict, so class 0 has no word.
    # Without this mask a winning class 0 would index row -1 and return the
    # last word of the table.
    scores[0] = 0

    predicted = []
    for _ in range(max_words):
        if scores.max() < min_prob:
            break
        best_class = scores.argmax()
        predicted.append(token_dict.index[best_class - 1])
        scores[best_class] = 0  # exclude it from the next round
    return predicted

In [6]:
def curr_word(input_text, max_words=3, min_prob=0.05):
  """Suggest completions for the word currently being typed.

  The last whitespace-separated word of `input_text` is used as a prefix.
  Dictionary words starting with that prefix are ranked by their probability
  in the module-level `matrix` (the most recent model output).

  The dictionary is all lower-case, so the prefix is matched
  case-insensitively; each suggestion keeps the characters the user actually
  typed and appends the rest of the matched word (typing "Th" yields "The").

  Args:
    input_text: the text typed so far.
    max_words: maximum number of suggestions to return (default 3).
    min_prob: stop as soon as the best remaining probability is below this
      value (default 0.05).

  Returns:
    A list of up to `max_words` suggestions, most probable first. Empty when
    there is no word to complete, no prediction is available yet, or nothing
    matches.
  """
  words = input_text.split()
  if not words or matrix.size == 0:
    return []

  typed = words[-1]
  prefix = typed.lower()

  # isinstance guard: skip any non-string index label instead of crashing.
  candidates = [w for w in token_dict.index
                if isinstance(w, str) and w.startswith(prefix)]
  token_ids = [token_dict.value[w] for w in candidates]
  try:
    probs = np.array([matrix[token_id] for token_id in token_ids])
  except IndexError:
    # An id outside the model's output range: no usable ranking.
    return []

  suggestion = []
  for _ in range(min(max_words, len(candidates))):
    if probs.max() < min_prob:
      break
    best = probs.argmax()
    suggestion.append(typed + candidates[best][len(prefix):])
    probs[best] = 0  # exclude it from the next round
  return suggestion

GUI

In [7]:
def run_GUI():
  """Open the test-box window and run its event loop until it is closed.

  The window holds a text entry, a 'Copy' button and, when suggestions are
  available, a listbox below the entry:
    * after a space is typed, next_word() proposes following words;
    * while a word is being typed (once the first space has been entered),
      curr_word() proposes completions;
    * double-clicking a suggestion inserts it into the entry.

  `input_box`, `lb` and `past` are kept as module-level globals, as before.
  """
  import pyperclip

  global input_box, past, lb

  root=Tk()
  root.geometry('900x300')
  root.config(bg='DodgerBlue4')
  root.title('Automatic Text Generation: Test-box')
  l1=Label(root,text='\nAutomatic Text Generation: Test-box \n  ',
           font='Candara 18 bold',bg='DodgerBlue4',fg='SlateGray1')
  l1.pack()
  input_box=Entry(root,width=75,font='Candara 15',bg='SlateGray1')
  input_box.pack()
  # True once the first space has been typed, i.e. once a prediction exists.
  past=False
  # Placeholder (never packed) so `lb` always refers to a widget.
  lb=Listbox(root)

  def copy():
    pyperclip.copy(input_box.get())
  Button(root,text='Copy',font='Candara 15',bg='DodgerBlue4',fg='SlateGray1',command=copy,relief='flat').pack(side=BOTTOM)

  def put(event):
    """Insert the double-clicked suggestion into the entry box."""
    global input_box, lb
    selection = lb.curselection()
    if not selection:
      # Double-click landed on no item: nothing to insert.
      return
    chosen = str(lb.get(selection[0]))
    input_text = str(input_box.get())
    words = input_text.split()

    # The suggestion completes a partial word only when the text does not end
    # in whitespace. After a space it is a whole next word and must be
    # inserted in full (otherwise "the " + "there" would insert just "re").
    completing = bool(words) and not input_text[-1].isspace()
    insert_text = chosen
    if completing and chosen.startswith(words[-1]):
      insert_text = chosen[len(words[-1]):]

    input_box.insert(END, insert_text + ' ')
    lb.destroy()
    suggestion = next_word(input_text + insert_text)
    if len(suggestion) > 0:
      listing(suggestion)

  def listing(predicted):
    """Show `predicted` in a fresh listbox below the entry."""
    global lb
    lb = Listbox(root,font='Candara 15',width=15,height=len(predicted),bg='SlateGray1')
    lb.bind('<Double-1>', put)
    lb.pack()
    for word in predicted:
      lb.insert(END,word)

  def keypress(e):
    """Refresh the suggestion list after every key press."""
    global past, lb
    suggestion=[]
    input_text=str(input_box.get())
    if past:
      lb.destroy()
    pyperclip.copy(input_box.get())
    if e.char == ' ':
      # Prediction function called after <space>
      suggestion=next_word(input_text)
      past=True
    elif past:
      suggestion=curr_word(input_text)

    if len(suggestion)>0:
      listing(suggestion)

  root.bind("<KeyPress>", keypress)
  root.mainloop()

In [8]:
# Splash window showing the project credits. It is destroyed on the first
# mouse movement over it, after which the test-box GUI is started.
root = Tk()
root.geometry('830x500')
root.config(bg='silver')


def close(e):
  """Event handler: close the splash window."""
  root.destroy()


def _place(text, font, fg=None, **grid_options):
  """Create one silver-backed label on the splash window and grid it."""
  label_options = {'text': text, 'font': font, 'bg': 'silver'}
  if fg is not None:
    label_options['fg'] = fg
  Label(root, **label_options).grid(**grid_options)


_pad = ' ' * 10

_place(_pad, 'Candara 45 bold', row=0)
_place(_pad + 'Project :', 'Candara 20 bold', row=3, column=0, stick='w')
_place(_pad + 'Automatic Text Generation', 'Candara 20 bold', row=4, column=0, stick='e')
_place(_pad + 'Developed By :', 'Candara 18 bold', fg='blue', row=5, column=1, stick='sw')
_place('Vaibhav Jain (181B232)', 'Candara 17 italic', fg='blue', row=6, column=1, stick='e')
_place('Vibhum Tripathi (181B237)', 'Candara 17 italic', fg='blue', row=7, column=1, stick='e')
_place('Mohit Sharma (181B129)', 'Candara 17 italic', fg='blue', row=8, column=1, stick='e')
_place(_pad + '----' * 16, 'Candara 17', fg='blue', row=9, column=1)
_place(_pad + 'Project Guide :', 'Candara 18 bold', fg='blue', row=10, column=1, stick='w')
_place('Dr. Ajay Kumar', 'Candara 17 italic', fg='blue', row=11, column=1, stick='e')

root.bind('<Motion>', close)
root.mainloop()

# The splash has been dismissed: open the main window.
run_GUI()