In [1]:
!pip install nltk



In [2]:
import nltk

In [3]:
#nltk.download()

In [4]:
from nltk.corpus import wordnet
import os
import numpy as np
import pandas as pd

In [5]:
# Directories
curr_dir = os.getcwd()
target_words_dir = 'Targets.txt'
# No file handle is opened at module level: get_target_words() opens the word
# list itself, so a handle created here would only leak.

def get_target_words(file_dir):
    """Read one target word per line from ``file_dir`` and lower-case it.

    Args:
        file_dir: Path of a text file that holds one word per line.

    Returns:
        list[str]: The lines in file order, lower-cased and without their
        line terminator. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_dir, "r") as target_txt:
        # Strip only the line terminator: slicing with [:-1] would also cut
        # the last letter of a final line that has no trailing newline.
        return [line.rstrip("\r\n").lower() for line in target_txt]

def get_dataframe(target_words):
    """Build a word/definition table from WordNet.

    For every word the definition of its first WordNet synset is used; words
    for which WordNet returns no synset are left out.

    Args:
        target_words: Iterable of words to look up.

    Returns:
        pandas.DataFrame: Columns ``words`` and ``description``, one row per
        word that has at least one synset.
    """
    data = {"words": [], "description": []}
    for word in target_words:
        synsets = wordnet.synsets(word)
        # Skip unknown words explicitly rather than swallowing every
        # exception, so genuine failures (such as WordNet data not being
        # installed) surface instead of silently yielding an empty table.
        if not synsets:
            continue
        description = synsets[0].definition()
        data["words"].append(word)
        data["description"].append(description)
    return pd.DataFrame(data)

In [6]:
# Load the target words (as a NumPy array) and look up a WordNet definition for each.
target_words = np.array(get_target_words(target_words_dir))
data = get_dataframe(target_words)

In [7]:
# Preview the first four rows of the word/definition table.
data.head(4)

Unnamed: 0,words,description
0,kitty,the combined stakes of the betters
1,lookup,an operation that determines whether one or mo...
2,otiose,serving no useful purpose; having no excuse fo...
3,gallant,a man who is much concerned with his dress and...


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

import tensorflow as tf

from nltk.corpus import stopwords  

# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.callbacks import EarlyStopping

# import tensorflow as tf

# from nltk.corpus import stopwords 

In [9]:
class PreProcess:
    """Stop-word removal, tokenisation and padding for a corpus of strings.

    Every instance owns its own ``Tokenizer``, so preprocessors created for
    different corpora never share (or re-fit) a vocabulary.

    Attributes:
        tokenizer: ``Tokenizer`` fitted on the stop-word-free corpus.
        corpus: The constructor's corpus with stop words removed (a list).
        max_sequence_len: Padding length; ``None`` until the first
            ``tokenize`` call, which fixes it for all later calls.
    """

    # Class-level default kept for backward compatibility; the effective
    # value is stored per instance.
    max_sequence_len = None

    def __init__(self, corpus):
        """Fit a fresh tokenizer on ``corpus`` (an iterable of strings)."""
        # A tokenizer defined on the class would be one object shared by all
        # instances, mixing the vocabularies of unrelated corpora.
        self.tokenizer = Tokenizer(oov_token="<OOV>")
        self.max_sequence_len = None
        # Loaded once per instance instead of on every call.
        self._stop_words = set(stopwords.words('english'))
        self.corpus = self.remove_stop_words(corpus)
        self.tokenizer.fit_on_texts(self.corpus)

    def remove_stop_words(self, data):
        """Return the texts in ``data`` with English stop words removed.

        Texts are split on single spaces and compared case-sensitively with
        the stop-word list. The input is not modified.

        Args:
            data: Iterable of strings.

        Returns:
            list[str]: One cleaned string per input text.
        """
        stop_words = self._stop_words
        return [
            ' '.join(token for token in text.split(" ") if token not in stop_words)
            for text in data
        ]

    def tokenize(self, data):
        """Convert texts to padded integer sequences.

        The first call fixes ``max_sequence_len`` to the longest sequence it
        sees; later calls pad (or truncate) to that same length.

        Args:
            data: Iterable of strings.

        Returns:
            The result of ``pad_sequences`` for the encoded texts.
        """
        cleaned = self.remove_stop_words(data)
        seq = self.tokenizer.texts_to_sequences(cleaned)
        if self.max_sequence_len is None:
            self.max_sequence_len = max(len(x) for x in seq)
        return pad_sequences(seq, maxlen=self.max_sequence_len)

    def get_word_index(self):
        """Return the tokenizer's word -> index mapping."""
        return self.tokenizer.word_index

    def get_seq_length(self):
        """Return the padding length, or ``None`` before the first ``tokenize`` call."""
        return self.max_sequence_len

    def get_vocab_length(self):
        """Return the number of indexed words plus one.

        NOTE(review): the extra slot is presumably for index 0, which the
        tokenizer does not assign to any word -- confirm.
        """
        return len(self.tokenizer.word_index) + 1
    

In [10]:
# Fit a preprocessor on the definitions and encode them as padded integer sequences.
pre_pro_X = PreProcess(data['description'])
X = np.array(pre_pro_X.tokenize(data['description']))

# Padding length chosen by tokenize(), and the vocabulary size that is passed
# to the Embedding layer when the model is built.
max_seq_length = pre_pro_X.get_seq_length()
vocab_length = pre_pro_X.get_vocab_length()
print(max_seq_length)

36


In [11]:
# Encode the target words with a second PreProcess instance.
pre_pro_y = PreProcess(data['words'])
y = np.array(pre_pro_y.tokenize(data['words']))

# One-hot encode the integer labels; the class count is the vocabulary
# length reported by pre_pro_y.
total_target_words = pre_pro_y.get_vocab_length()
ys = tf.keras.utils.to_categorical(y, num_classes=total_target_words)


In [12]:
#print(ys.shape)

In [13]:
#ys

In [14]:
# Classifier: padded definition tokens in, one softmax score per target class out.
model = Sequential()

# The input length follows the padding length computed by the preprocessor
# (max_seq_length) instead of a hard-coded 36, so the model stays in sync
# with the data when the word list changes. 100 is the embedding dimension.
model.add(Embedding(vocab_length, 100, input_length=max_seq_length))

# Bidirectional LSTM with 256 units per direction.
model.add(Bidirectional(LSTM(256)))

model.add(Dense(total_target_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="ADAM",metrics=['accuracy'])

In [None]:
# Train for 5 epochs with an early-stopping callback that monitors training accuracy.
# NOTE(review): with patience=5 and epochs=5 the callback looks like it can
# never trigger -- confirm the intended values.
earlystop = EarlyStopping(monitor='accuracy', min_delta=0, patience=5, verbose=0, mode='auto')
history = model.fit(X, ys, epochs=5, verbose=1,callbacks=[earlystop])

Epoch 1/5

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch number.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values (here: the value returned by ``model.fit``).
        string: Name of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()

In [None]:
# Training loss per epoch.
plot_graphs(history, 'loss')

In [None]:
# Training accuracy per epoch.
plot_graphs(history,'accuracy')

In [None]:
#test_desc_encoded

In [None]:
#predicted = np.argmax(model.predict(test_desc_encoded), axis=-1)
# wrd = 'e x _ m p _ a _'
# test_description = "something to be imitated"

# test_desc_encoded = pre_pro_X.tokenize([test_description])
# preds = list(model.predict(test_desc_encoded).reshape(-1))

In [None]:
class Predict:
    """Pick the most probable target word for a masked word and its definition.

    A masked word is written as space-separated characters with ``_`` for
    every unknown letter, e.g. ``'e x _ m p _ a _'``.

    Attributes:
        target_words: Iterable of candidate words.
        model: Trained model; ``model.predict`` returns one score per class.
        pre_pro_X: Preprocessor used to encode definitions.
        pre_pro_y: Preprocessor whose tokenizer maps target words to the
            class indices the model was trained on.
        final_preds: Dict with keys ``"Masked"`` and ``"preds"`` collecting
            the results of every ``predict_words`` call on this instance.
    """

    def __init__(self, target_words, model, pre_pro_X, pre_pro_y):
        self.target_words = target_words
        self.model = model
        self.pre_pro_X = pre_pro_X
        self.pre_pro_y = pre_pro_y
        # Per-instance container: a dict defined on the class would be
        # shared (and grown) by every Predict instance.
        self.final_preds = {"Masked": [], "preds": []}

    def get_words_set(self, pred_word, dashed, wrd_len):
        """Return the target words that could complete ``pred_word``.

        Args:
            pred_word: The known letters of the masked word, concatenated.
            dashed: Number of unknown letters in the mask.
            wrd_len: Total length of the masked word.

        Returns:
            list: Target words of length ``wrd_len`` whose edit distance to
            ``pred_word`` equals ``dashed``.
        """
        # The cheap length test runs first so the edit distance is only
        # computed for words of the right size.
        return [
            word for word in self.target_words
            if len(word) == wrd_len and nltk.edit_distance(word, pred_word) == dashed
        ]

    def predict_word(self, wrd, preds):
        """Return the best-scoring target word that fits the mask.

        Args:
            wrd: Masked word (space-separated characters, ``_`` = unknown).
            preds: Sequence of class scores, indexed by the index that
                ``pre_pro_y``'s tokenizer assigns to each word.

        Returns:
            The matching word with the highest score, or ``None`` when no
            target word fits the mask and is known to the tokenizer.
        """
        # Target words are lower-cased when loaded, so compare in lower case.
        mask = wrd.lower().split()
        known_letters = ''.join(ch for ch in mask if ch != '_')
        candidates = self.get_words_set(known_letters, mask.count('_'), len(mask))

        word_index = self.pre_pro_y.tokenizer.word_index
        best_word, best_score = None, None
        for word in candidates:
            # Edit distance ignores where the letters sit; also require that
            # every known letter is at the position given by the mask.
            if any(m != '_' and m != ch for m, ch in zip(mask, word)):
                continue
            # Labels were one-hot encoded from tokenizer indices, so the
            # class of a word is word_index[word], not its position in the
            # key list (which is one lower).
            class_idx = word_index.get(word)
            if class_idx is None or class_idx >= len(preds):
                continue
            score = preds[class_idx]
            if best_score is None or score > best_score:
                best_word, best_score = word, score
        return best_word

    def predict_words(self, words_dataframe):
        """Predict a word for every row of ``words_dataframe``.

        Args:
            words_dataframe: DataFrame with a ``Masked`` column (masked
                words) and a ``Meaning`` column (definitions).

        Returns:
            pandas.DataFrame: All results collected on this instance so
            far, with columns ``Masked`` and ``preds``.
        """
        # Iterating a DataFrame directly yields column names; iterrows()
        # yields the (index, row) pairs that are needed here.
        for _, row in words_dataframe.iterrows():
            test_desc_encoded = self.pre_pro_X.tokenize([row['Meaning']])
            preds = list(self.model.predict(test_desc_encoded).reshape(-1))
            predicted = self.predict_word(row["Masked"], preds)
            # Append both columns together so they cannot get out of step
            # if a prediction fails.
            self.final_preds["Masked"].append(row["Masked"])
            self.final_preds["preds"].append(predicted)
        return pd.DataFrame(self.final_preds)
                   

                
# Build the predictor and run it on one masked word / definition pair.
predict = Predict(target_words, model, pre_pro_X, pre_pro_y)
print(predict.predict_words(pd.DataFrame({"Masked":['e x _ m p _ a _'],"Meaning":["something to be imitated"]})))
                
# def get_words_set(pred_word,dashed,wrd_len):
#     words_set  = []
#     for word in target_words:
#         if(nltk.edit_distance(word, pred_word)== dashed and wrd_len ==  len(word)):
#             words_set.append(word)
#     return(words_set)

# def predict_words(wrd, preds):
#     wrd_lst = wrd.split(' ')
#     wrd_len = len(wrd_lst)
#     wrd_lst= filter(lambda x: x!='_',wrd_lst)
#     wrd_cnv = ''.join(x for x in wrd_lst)
    
    
# #     for wrd in wrd_lst:
# #         if wrd!='_':wrd_conv = wrd_conv+wrd
        
#     words_set = get_words_set(wrd_conv,wrd_lst.count('_'),wrd_len)
#     target_wrds_set = list(pre_pro_y.tokenizer.word_index.keys())
#     target_wrds_inxs, selective_preds = [],[]
    
#     for word in words_set:
#         try:target_wrds_inxs.append(word)
#         except:pass    
        
#     for inx in target_wrds_inxs:
#         try:selective_preds.append(preds[target_wrds_set.index(inx)])
#         except:pass

#     max_prob = max(selective_preds)
#     return(target_wrds_set[preds.index(max_prob)])
   
#print(predict_words(wrd,preds))


In [None]:
# Manual example: a masked word and its definition.
wrd = 'g _ m'
test_description = "a herd of whales"

# Encode the definition and flatten the model's softmax output into a plain list of scores.
test_desc_encoded = pre_pro_X.tokenize([test_description])
preds = list(model.predict(test_desc_encoded).reshape(-1))