In [1]:
#importing libraries
import spacy
from spacy.vocab import Vocab
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
import pickle

Using TensorFlow backend.


In [2]:
#reading processed data
data = open('cleandata.csv').read()[:100000]

#function for preparing text data into sequences for training 
def data_sequencing(data):   
    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])
    with open('tokenizer.pkl', 'wb') as f: # Save the tokeniser by pickling it
        pickle.dump(tokenizer, f)

    encoded = tokenizer.texts_to_sequences([data])[0]
    # retrieve vocabulary size
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)
    
    # create line-based sequences
    sequences = list()
    rev_sequences = list()
    for line in data.split('.'):
        encoded = tokenizer.texts_to_sequences([line])[0]
        rev_encoded = encoded[::-1]
        for i in range(1, len(encoded)):
            sequence = encoded[:i+1]
            rev_sequence = rev_encoded[:i+1]
            sequences.append(sequence)
            rev_sequences.append(rev_sequence)
    print('Total Sequences: %d' % len(sequences))
    
    
    #find max sequence length 
    max_length = max([len(seq) for seq in sequences])
    with open('max_length.pkl', 'wb') as f: # Save max_length by pickling it
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad sequences and create the forward sequence
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    sequences = array(sequences)
    print(sequences[0])
    X, y = sequences[:,:-1],sequences[:,-1]
    
    print(sequences)
    print("X : ",X)
    print("Y: ",y)
    #pad sequences and create the reverse sequencing
    rev_sequences = pad_sequences(rev_sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    rev_sequences = array(rev_sequences)
    rev_X, rev_y = rev_sequences[:,:-1],rev_sequences[:,-1]

    return X,y,rev_X,rev_y,max_length,vocab_size

In [3]:
#returning forward and reverse sequences along with max sequence 
#length from the data 

X,y,rev_X,rev_y,max_length,vocab_size = data_sequencing(data)

Vocabulary Size: 3777
Total Sequences: 15197
Max Sequence Length: 50
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0 193   3]
[[   0    0    0 ...    0  193    3]
 [   0    0    0 ...  193    3  360]
 [   0    0    0 ...    3  360   19]
 ...
 [   0    0    0 ... 3776   48   26]
 [   0    0    0 ...   48   26 1026]
 [   0    0    0 ...   26 1026  607]]
X :  [[   0    0    0 ...    0    0  193]
 [   0    0    0 ...    0  193    3]
 [   0    0    0 ...  193    3  360]
 ...
 [   0    0    0 ...    2 3776   48]
 [   0    0    0 ... 3776   48   26]
 [   0    0    0 ...   48   26 1026]]
Y:  [   3  360   19 ...   26 1026  607]


In [4]:
# generate a sequence using a language model
def generate_seq(model, tokenizer, max_length, seed_text):
    if seed_text == "":
        return ""
    else:
        in_text = seed_text
        n_words = 1
        n_preds = 5 #number of words to predict for the seed text
        pred_words = ""
        # generate a fixed number of words
        for _ in range(n_words):
            # encode the text as integer
            encoded = tokenizer.texts_to_sequences([in_text])[0]
            # pre-pad sequences to a fixed length
            encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
            # predict probabilities for each word
            proba = model.predict(encoded, verbose=0).flatten()
            #take the n_preds highest probability classes 
            yhat = numpy.argsort(-proba)[:n_preds] 
            # map predicted words index to word
            out_word = ''


            for _ in range(n_preds):
                for word, index in tokenizer.word_index.items():
                    if index == yhat[_] and word not in stoplist:
                        out_word = word
                        pred_words += ' ' + out_word
                        #print(out_word)
                        break
            
            reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
                        
            '''for k in range(n_preds):
                out_word = reverse_word_map[yhat[k]]
                if out_word not in stopwords:
                    pred_words += ' '+ out_word'''


        return pred_words

In [5]:
'''for a,b in tokenizer.word_index.items():
    print("Word: ",a)
    print("inde: ",b)'''
dic = tokenizer.word_index.items()
print(dic)
print(tokenizer.texts_to_sequences(['indians'])[0][0])
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
print(reverse_word_map[3772]+"ss")

NameError: name 'tokenizer' is not defined

In [6]:
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')

In [7]:
# load the model
model = load_model('model.h5')
rev_model = load_model('rev_model.h5')

#load tokeniser and max_length
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
    
with open('max_length.pkl', 'rb') as f:
    max_length = pickle.load(f)

In [8]:
#Find and set embeddings for OOV words
'''
def set_embedding_for_oov(doc):
    #checking for oov words and adding embedding
    for token in doc:
        if token.is_oov == True:
            before_text = doc[:token.i].text
            after_text = str(array(doc)[:token.i:-1]).replace('[','').replace(']','')

            pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
            pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()
            
            embedding = numpy.zeros((300,))

            i=len(before_text)
            print('Words predicted from forward sequence model:')
            for word in pred_before:
                print(word)
                embedding += i*nlp.vocab.get_vector(word)
                i= i*.5
            i=len(after_text)
            print('Words predicted from reverse sequence model:')
            for word in pred_after:
                print(word)
                embedding += i*nlp.vocab.get_vector(word)
                i= i*.5
            nlp.vocab.set_vector(token.text, embedding)
            print(token.text,nlp.vocab.get_vector(token.text))
'''

def set_embedding_for_oov(doc,i):
    #checking for oov words and adding embedding
    d = doc.split()
    c=0
    s = ""
    for w in d:
        c+=1
        s = s+" "+w
        if c==i:
            break
    s.strip()
    before_text = s
    c=0
    s = ""
    for w in d:
        if c>i:
            s = s+" "+w
        c+=1
    s.strip()
    after_text = s
    pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
    pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()
            
    embedding = numpy.zeros((1,100))
    
    we = model.layers[0].get_weights()[0]
    
    i=len(before_text)
    print('Words predicted from forward sequence model:')
    for word in pred_before:
        print(word)
        
        embedding += i*(we[tokenizer.texts_to_sequences([word])[0]])
        i= i*.5
    i=len(after_text)
    print('Words predicted from reverse sequence model:')
    for word in pred_after:
        print(word)
        embedding += i*(we[tokenizer.texts_to_sequences([word])[0]])
        i= i*.5
    #print("livwgffe")
    print(we)
    return embedding

In [9]:
tokenizer.texts_to_sequences(['Arunachal'])[0]

[]

In [10]:
doc = 'two days of rescue operations in Arunachal'
wvec = set_embedding_for_oov(doc,6)

Words predicted from forward sequence model:
iraq
several
baghdad
northern
Words predicted from reverse sequence model:
[[ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
   0.30900633]
 [ 0.4312459   0.8094974   0.5639969  ... -0.3482559   0.66176736
   0.34343928]
 [ 0.0354765   0.14547107 -0.08885228 ... -0.04287698  0.98114103
   1.0777733 ]
 ...
 [ 0.29039997  0.4459584   0.07219367 ...  0.38600206 -0.08380733
  -0.27997503]
 [ 0.34965372  0.16146661  0.3918662  ...  0.16594979 -0.10325196
  -0.11696597]
 [-0.35282457 -0.52569634  0.10266012 ...  0.16191858  0.11232043
  -0.19642301]]


In [11]:
print(len(wvec[0]))
wvec[0]

100


array([-6.61326876e+00, -2.89705232e+00, -5.33135384e+00, -7.65619746e+00,
       -3.02188966e+00, -3.13161789e+00,  1.89576739e+00, -3.42928363e+00,
       -7.71816780e+00,  3.26518655e-01,  1.77042051e+00, -1.93851203e+00,
       -8.17610276e+00,  2.16638649e+00, -3.70512629e+00,  8.53973867e+00,
       -5.59404346e+00, -4.87772787e+00,  3.18195014e+00,  6.26531994e+00,
       -1.07281300e+01,  3.77526876e+00, -2.58198324e+00, -6.57297283e+00,
       -1.49671459e+00, -3.54070982e+00, -7.26853061e+00,  4.29179969e+00,
        1.69239494e+00, -2.70131749e+00, -7.05439737e+00,  3.44780352e-01,
       -1.27601066e+00,  6.81959927e-01,  1.63765249e+00, -8.04486176e+00,
        2.65309399e+00, -3.06206321e+00, -8.30527544e-02, -6.98175612e+00,
        3.75216502e+00,  4.93146596e+00,  8.65362599e-01, -9.76684541e+00,
        3.94453585e-01, -2.23768157e+00, -2.87805526e+00, -1.00782246e+01,
        1.31710065e+01,  6.60505703e+00,  2.42221969e+00,  3.07164747e+00,
       -3.24543214e+00, -

In [12]:
we = model.layers[0].get_weights()[0]
liv = we[tokenizer.texts_to_sequences(['technology'])[0]]
print(liv[0])

[ 0.5689838   0.02723964  0.01090907  0.20310909  0.19987588 -0.11576185
  0.17171475  0.33351701 -0.23342384  0.08334058  0.22699146  0.32254574
  0.13263938  0.21751913  0.05855879  0.05582382 -0.2082214   0.0358717
  0.1329843   0.34725454 -0.13797463  0.16414174  0.26986384 -0.09112421
 -0.1609759   0.10938127 -0.28091666  0.21859422 -0.09542681 -0.2855383
 -0.5478957   0.12655789 -0.08379794  0.05684386 -0.16860718 -0.32867682
 -0.04619893  0.18602961  0.15340003 -0.7332512  -0.10149116 -0.06712315
  0.25863004 -0.05319246 -0.05742927 -0.05189406 -0.11470417  0.07807527
 -0.07891811 -0.20015687  0.19457817 -0.09958789  0.00175214  0.05241703
  0.26727694 -0.08773102  0.0944387  -0.02738621  0.24147841  0.14021084
 -0.10186103  0.01337525 -0.159075    0.41717976  0.14805643  0.19271347
  0.11069132  0.2028703  -0.07766058 -0.17397748  0.18074742 -0.03884061
 -0.00242406 -0.03615341 -0.1070043   0.01253707 -0.07937782 -0.37821183
  0.1932557   0.39728084 -0.23277296  0.13214952 -0.0

In [13]:
import tensorflow as tf
def cos_simm(c,d):
    a = tf.placeholder(tf.float32, shape=[None], name="input_placeholder_a")
    b = tf.placeholder(tf.float32, shape=[None], name="input_placeholder_b")
    normalize_a = tf.nn.l2_normalize(a,0)        
    normalize_b = tf.nn.l2_normalize(b,0)
    cos_similarity=tf.reduce_sum(tf.multiply(normalize_a,normalize_b))
    sess=tf.Session()
    cos_sim=sess.run(cos_similarity,feed_dict={a:c,b:d})
    print(cos_sim)
    
cos_simm(liv[0],wvec[0])

0.24750036


In [14]:
tokenizer.word_index.items()



In [15]:
def most_sim(vec):
    v = {}
    for w,i in tokenizer.word_index.items():
        tvec = we[tokenizer.texts_to_sequences([w])[0]]
        v[w] = cos_simm(tvec[0],vec[0])
    return v
s=most_sim(wvec)
print(s)


-0.20746462
-0.30095342
-0.20297293
-0.38978025
-0.24928938
0.040446628
-0.25881615
-0.1396482
-0.03917646
-0.10063427
-0.14195502
-0.1288736
-0.35449576
0.111577496
-0.1284844
-0.21637127


KeyboardInterrupt: 

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_sim(x,y):
    return cosine_similarity(x,y)

In [17]:
a = cosine_sim(we,wvec)
len(a)

3777

In [18]:
def sim_dict(z,a):
    d={}
    
    for w,i in tokenizer.word_index.items():
        d[w] = list(a[i])[0]
    
    return d

bag = sim_dict(1,a)

In [19]:
bag = sorted(bag.items(), key=lambda x:x[1],reverse = True)

In [20]:
bag

[('iraq', 0.8222413257714514),
 ('different', 0.5225884355796095),
 ('areas', 0.4927706300464644),
 ('off', 0.4741870487293678),
 ('schools', 0.4644263445524407),
 ('argentina', 0.4639873123335596),
 ('violations', 0.4526879405186852),
 ('own', 0.44353494322790044),
 ('krona', 0.441511891276929),
 ('niger', 0.4351233695005512),
 ('amir', 0.4338025929007191),
 ('suffering', 0.42965546637095886),
 ('hague', 0.42830126805822055),
 ('count', 0.42634470311888395),
 ('travelers', 0.42441422638970006),
 ('baghdad', 0.4234930935121724),
 ('contractors', 0.42257826495581685),
 ('broadcast', 0.42020005963175133),
 ('ups', 0.41686175632035083),
 ('typhoon', 0.4148723245834399),
 ('islands', 0.4132938542638227),
 ('rebuild', 0.4127795668135556),
 ('african', 0.4114407769171916),
 ('howard', 0.4082873522800168),
 ('zapatero', 0.4056390775161166),
 ('employment', 0.40476903329638225),
 ('nias', 0.4043767570501567),
 ('bus', 0.4036762980659195),
 ('delhi', 0.4033726863247523),
 ('chavez', 0.401027056

In [21]:
i=0
for word,sim in bag:
    if word not in stoplist:
        print(word)
        i+=1
        if i == 10:
            break

iraq
different
areas
schools
argentina
violations
krona
niger
amir
suffering


In [22]:
import pandas as pd
import numpy as np

data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [35]:
words = list(set(data["Word"].values))
for w in words:
    if w=='different':
        print("MILA")
        break

MILA


In [47]:
s = data["Word"]
i = s[s=="different"].index
i[0]
data["Tag"].iloc[i[0]]

'O'

In [None]:
d = 'i livwgffe in london'
d = d.split()
s = ""
c=0
i=1
for w in d:
    if c>i:
        s = s+" "+w
    c+=1
s.strip()

In [86]:
tokenizer.texts_to_sequences(['are'])[0]

[27]

In [89]:
doc = 'Mark and John are working at Google bad boy'
wvec = set_embedding_for_oov(doc,6)

Words predicted from forward sequence model:
Words predicted from reverse sequence model:
another
following
[[ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
   0.30900633]
 [ 0.4312459   0.8094974   0.5639969  ... -0.3482559   0.66176736
   0.34343928]
 [ 0.0354765   0.14547107 -0.08885228 ... -0.04287698  0.98114103
   1.0777733 ]
 ...
 [ 0.29039997  0.4459584   0.07219367 ...  0.38600206 -0.08380733
  -0.27997503]
 [ 0.34965372  0.16146661  0.3918662  ...  0.16594979 -0.10325196
  -0.11696597]
 [-0.35282457 -0.52569634  0.10266012 ...  0.16191858  0.11232043
  -0.19642301]]


In [90]:
wvec

array([[-0.93312046,  1.25591677,  1.14888746, -1.36455841, -0.64644194,
        -3.35248869,  3.08508551, -0.63104972,  3.13074595,  1.95684135,
         1.36409903, -2.03552932, -0.49231052, -0.82158899,  1.72321349,
        -0.35279012, -2.44306598, -3.05517328,  1.22151452,  2.04921889,
        -2.72322463, -3.26286274,  0.26696765, -0.7281813 , -0.94488814,
        -2.59427559, -0.45956957, -1.52193809, -3.3967334 ,  0.16014626,
         1.2755032 ,  4.19891286, -4.63103199,  0.47993919, -0.95677957,
         6.49070716, -2.25257203, -0.24924088, -2.35969353, -2.19521174,
        -2.83930156, -1.54602301,  5.15664601,  0.24498057,  0.31249332,
         0.54291672, -3.10381541,  1.78369281, -3.67011535, -2.71680206,
         2.20129994, -3.49429083,  0.69243658, -2.17300415,  1.21716285,
        -1.20140634, -1.39264148, -1.24009925, -0.23012586,  0.56739712,
        -3.5583716 ,  2.1598528 ,  0.05089235,  2.95047617, -1.10499102,
         0.73589671, -2.06816864,  3.27942886,  1.4

In [94]:
we = model.layers[0].get_weights()[0]
a = cosine_sim(we,wvec)
bag = sim_dict(1,a)
bag = sorted(bag.items(), key=lambda x:x[1],reverse = True)
i=0
for word,sim in bag:
    if word not in stoplist:
        print(word)
        i+=1
        if i == 5:
            break

another
following
lay
specialist
zapatista


In [None]:
print(wvec.shape)