In [1]:
#importing libraries
import spacy
from spacy.vocab import Vocab
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
import pickle

Using TensorFlow backend.


In [2]:
# generate a sequence using a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_preds=5):
    """Return the most probable next words for ``seed_text``.

    Parameters
    ----------
    model : object with ``predict(x, verbose=0)``
        Language model whose output is flattened into one score per class.
    tokenizer : object with ``texts_to_sequences`` and ``word_index``
        Tokenizer used to encode the seed text and to map indices to words.
    max_length : int
        Length the encoded seed text is pre-padded to.
    seed_text : str
        Context text to predict from.
    n_preds : int, optional
        Number of highest-scoring classes to consider (default 5).

    Returns
    -------
    str
        The predicted words in descending score order, each preceded by a
        single space (callers use ``.split()``). Words found in the
        module-level ``stoplist`` and indices with no entry in
        ``tokenizer.word_index`` are skipped, so fewer than ``n_preds`` words
        may be returned. An empty ``seed_text`` yields ``""``.
    """
    if seed_text == "":
        return ""

    # encode the text as integers
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # pre-pad sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # predict probabilities for each word
    proba = model.predict(encoded, verbose=0).flatten()
    # take the n_preds highest probability classes
    top_indices = numpy.argsort(-proba)[:n_preds]

    # Build the index -> word map once instead of scanning word_index for
    # every predicted index.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    pred_words = ""
    for index in top_indices:
        word = index_to_word.get(int(index))
        if word is not None and word not in stoplist:
            pred_words += ' ' + word

    return pred_words

In [3]:
from nltk.corpus import stopwords

# Initialize the stopwords: NLTK's English stop-word list. generate_seq and
# set_score read this module-level name to filter their word lists.
stoplist = stopwords.words('english')

In [4]:
# Load the two saved Keras models from the working directory.
# `model` is used for the text before the target word and `rev_model` for the
# text after it (see set_embedding_for_oov).
model = load_model('model.h5')
rev_model = load_model('rev_model.h5')

# Load the pickled tokenizer and max_length.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# these artifacts if they come from a trusted source.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
    
with open('max_length.pkl', 'rb') as f:
    max_length = pickle.load(f)

In [5]:
def set_embedding_for_oov(doc,i):
    """Estimate an embedding for the word at position ``i`` of ``doc``.

    The sentence is split on whitespace. The words before position ``i`` are
    fed to the module-level ``model`` and the words after it to ``rev_model``
    (both through ``generate_seq``). The embedding rows of the predicted
    words, taken from the first layer of ``model``, are summed with decaying
    weights.

    Parameters
    ----------
    doc : str
        Whitespace-separated sentence containing the target word.
    i : int
        Zero-based index of the target word in ``doc.split()``. Expected to
        be non-negative.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, embedding_dim)``; all zeros when neither model
        predicts a word.

    Notes
    -----
    For each side, the first predicted word is weighted by the character
    length of that side's context text and every following word by half the
    previous weight. Each predicted word is printed as it is used.
    """
    words = doc.split()
    # Slicing keeps the left context empty when the target is the first word
    # and drops the stray leading space the context strings used to carry.
    before_text = " ".join(words[:max(i, 0)])
    after_text = " ".join(words[max(i + 1, 0):])

    pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
    pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()

    # Embedding matrix of the forward model; also used for the words
    # predicted by the reverse model.
    we = model.layers[0].get_weights()[0]
    embedding = numpy.zeros((1, we.shape[1]))

    sides = (
        ('Words predicted from forward sequence model:', before_text, pred_before),
        ('Words predicted from reverse sequence model:', after_text, pred_after),
    )
    for header, context_text, predictions in sides:
        print(header)
        weight = len(context_text)
        for word in predictions:
            print(word)
            token_ids = tokenizer.texts_to_sequences([word])[0]
            # A word the tokenizer cannot encode contributes nothing, but
            # the weight still decays with its rank.
            if token_ids:
                embedding += weight * we[token_ids].sum(axis=0)
            weight = weight * .5
    return embedding

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_sim(x,y):
    """Thin wrapper around scikit-learn's ``cosine_similarity``.

    Passes ``x`` and ``y`` through unchanged and returns whatever
    ``cosine_similarity(x, y)`` returns.
    """
    return cosine_similarity(x,y)

In [7]:
def set_score(wvec, top_n=5):
    """Return the vocabulary words whose embeddings are closest to ``wvec``.

    Every row of the first-layer weight matrix of the module-level ``model``
    is compared with ``wvec`` by cosine similarity. Words from
    ``tokenizer.word_index`` are ranked by that similarity, highest first,
    and the first ``top_n`` that are not in ``stoplist`` are kept.

    Parameters
    ----------
    wvec : array-like
        Query vector(s) passed as the second argument of ``cosine_sim``; only
        the similarity to its first row is used.
    top_n : int, optional
        Maximum number of words to return (default 5).

    Returns
    -------
    list of str
        Up to ``top_n`` words, most similar first. The list is also printed.
    """
    we = model.layers[0].get_weights()[0]
    similarities = cosine_sim(we, wvec)[:, 0]

    # Skip indices with no row in the embedding matrix rather than failing
    # with an IndexError.
    scored = [
        (word, similarities[index])
        for word, index in tokenizer.word_index.items()
        if index < len(similarities)
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    wl = []
    for word, _similarity in scored:
        if len(wl) >= top_n:
            break
        if word not in stoplist:
            wl.append(word)

    print("Similar words: ",wl)
    return wl
    

In [8]:
import pandas as pd

# Load the annotated NER corpus; tag() reads its "Word" and "Tag" columns.
df = pd.read_csv('ner_dataset.csv', encoding = 'latin1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [9]:
def tag(l):
    """Vote on a NER tag for a ranked list of words and print the winner.

    Each word is looked up in the "Word" column of the module-level ``df``,
    first as given and then title-cased, and the tag of its first occurrence
    is taken from the "Tag" column. A word adds ``15 * weight`` to its tag's
    score, or ``5 * weight`` when the tag is ``'O'``. The weight starts at
    1.0 and drops by 0.1 for every position in ``l``. Words found in neither
    form are skipped but still consume their position.

    Parameters
    ----------
    l : list of str
        Candidate words, most relevant first.

    Returns
    -------
    None
        The score table and the winning tag are printed. When no word is
        found in the dataset the printed tag is ``None``.
    """
    words = df["Word"].tolist()
    tags = df["Tag"].tolist()

    def _first_tag(candidate):
        """Return the tag of the first row whose word is ``candidate``, else None."""
        try:
            return tags[words.index(candidate)]
        except ValueError:
            return None

    percent = 1.0
    t = {}
    for w in l:
        label = _first_tag(w)
        if label is None:
            # Retry title-cased, e.g. 'iraq' -> 'Iraq'.
            label = _first_tag(w.title())
        if label is not None:
            # Compare by value; identity comparison with a literal is unreliable.
            c = 5 if label == 'O' else 15
            t[label] = t.get(label, 0) + c*percent
        percent = percent - 0.1

    print(t)

    if not t:
        print("Tag is : ", None)
        return

    # max() returns the first of several equal scores, matching a stable
    # descending sort.
    best = max(t.items(), key=lambda item: item[1])
    print("Tag is : ",best[0])

In [10]:
def find_tag(doc,i):
    """Print the predicted NER tag for the word at position ``i`` of ``doc``.

    Chains the three stages: build an embedding for the target word from its
    context, find the vocabulary words most similar to that embedding, and
    let those words vote on a tag.
    """
    tag(set_score(set_embedding_for_oov(doc, i)))

In [11]:
# Target word: "Arunachal" (index 6, the last word), so there is no text after it.
doc = 'two days of rescue operations in Arunachal'
find_tag(doc,6)

Words predicted from forward sequence model:
iraq
several
baghdad
northern
Words predicted from reverse sequence model:
Similar words:  ['iraq', 'different', 'areas', 'schools', 'argentina']
{'B-geo': 15.0, 'O': 12.0, 'B-org': 9.000000000000002}
Tag is :  B-geo


In [12]:
# Target word: "Parle-G" (index 5), with context on both sides.
doc = '26 children workers rescued from Parle-G plant in Chhattisgarh'
find_tag(doc,5)

Words predicted from forward sequence model:
palestinian
war
two
one
afghan
Words predicted from reverse sequence model:
deaths
prisoners
living
home
kidnapped
Similar words:  ['palestinian', 'deaths', 'living', 'security', 'service']
{'B-gpe': 15.0, 'O': 15.0}
Tag is :  B-gpe


In [15]:
# Target word: "Odisha" (index 11), with context on both sides.
doc = 'Some of the children allegedly employed at Apple factory hailed from Odisha and Jharkhand'
find_tag(doc,11)

Words predicted from forward sequence model:
indian
march
american
jerusalem
Words predicted from reverse sequence model:
biathlon
cities
bronchitis
darfur
election
Similar words:  ['indian', 'staged', 'biathlon', 'kill', 'foundations']
{'B-geo': 15.0, 'O': 15.0}
Tag is :  B-geo
