In [1]:
#importing libraries
import spacy
from spacy.vocab import Vocab
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
import pickle

Using TensorFlow backend.


In [2]:
# Read the processed corpus, keeping only its first 100,000 characters.
# The context manager closes the file handle as soon as the text is read,
# and read(100000) avoids loading the rest of the file just to discard it.
with open('cleandata.csv') as corpus_file:
    data = corpus_file.read(100000)

#function for preparing text data into sequences for training
def data_sequencing(data):
    """Turn raw text into padded next-word training arrays.

    A Keras ``Tokenizer`` is fitted on the whole of ``data``. The text is then
    split on '.', and for every piece with n encoded words the prefixes of
    length 2..n are collected, once in reading order and once with the piece
    reversed. All prefixes are left-padded to the longest prefix length; the
    last column becomes the target and the remaining columns the input.

    Side effects: pickles the fitted tokenizer to 'tokenizer.pkl' and the
    maximum prefix length to 'max_length.pkl', and prints summary statistics
    and the resulting arrays.

    Args:
        data: the training text as a single string.

    Returns:
        Tuple ``(X, y, rev_X, rev_y, max_length, vocab_size)`` where ``X``/``y``
        are the forward inputs/targets, ``rev_X``/``rev_y`` the same for the
        reversed pieces, ``max_length`` the padded prefix length (inputs have
        ``max_length - 1`` columns) and ``vocab_size`` the number of distinct
        words plus one.

    Raises:
        ValueError: if no piece of ``data`` contains at least two words, so
            no training sequence can be built.
    """
    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])
    with open('tokenizer.pkl', 'wb') as f: # Save the tokeniser by pickling it
        pickle.dump(tokenizer, f)

    # retrieve vocabulary size (+1 because word indices start at 1)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)

    # create line-based sequences: every prefix of length >= 2, in both
    # reading order and reversed order
    sequences = []
    rev_sequences = []
    for line in data.split('.'):
        encoded = tokenizer.texts_to_sequences([line])[0]
        rev_encoded = encoded[::-1]
        for end in range(2, len(encoded) + 1):
            sequences.append(encoded[:end])
            rev_sequences.append(rev_encoded[:end])
    print('Total Sequences: %d' % len(sequences))

    if not sequences:
        # max() on an empty list would fail with an unhelpful message.
        raise ValueError('data_sequencing: no training sequences could be '
                         'built; every sentence has fewer than two words')

    #find max sequence length
    max_length = max(len(seq) for seq in sequences)
    with open('max_length.pkl', 'wb') as f: # Save max_length by pickling it
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad sequences and create the forward sequence
    # (pad_sequences already returns a numpy array)
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    print(sequences[0])
    # split into input and output elements
    X, y = sequences[:, :-1], sequences[:, -1]

    print(sequences)
    print("X : ",X)
    print("Y: ",y)

    #pad sequences and create the reverse sequencing
    rev_sequences = pad_sequences(rev_sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    rev_X, rev_y = rev_sequences[:, :-1], rev_sequences[:, -1]

    return X,y,rev_X,rev_y,max_length,vocab_size

In [3]:
# Build the training arrays from the corpus: forward inputs/targets (X, y),
# reverse inputs/targets (rev_X, rev_y), the padded sequence length and the
# vocabulary size. This call also writes tokenizer.pkl and max_length.pkl.

X,y,rev_X,rev_y,max_length,vocab_size = data_sequencing(data)

Vocabulary Size: 3777
Total Sequences: 15197
Max Sequence Length: 50
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0 193   3]
[[   0    0    0 ...    0  193    3]
 [   0    0    0 ...  193    3  360]
 [   0    0    0 ...    3  360   19]
 ...
 [   0    0    0 ... 3776   48   26]
 [   0    0    0 ...   48   26 1026]
 [   0    0    0 ...   26 1026  607]]
X :  [[   0    0    0 ...    0    0  193]
 [   0    0    0 ...    0  193    3]
 [   0    0    0 ...  193    3  360]
 ...
 [   0    0    0 ...    2 3776   48]
 [   0    0    0 ... 3776   48   26]
 [   0    0    0 ...   48   26 1026]]
Y:  [   3  360   19 ...   26 1026  607]


In [None]:
# define forward sequence model
# Embedding -> bidirectional LSTM -> softmax over the vocabulary.
# The input length is max_length - 1 because the last column of every padded
# sequence is split off as the target.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length - 1),
    Bidirectional(LSTM(100)),
    Dense(vocab_size, activation='softmax'),
])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

In [None]:
# define reverse model
# Same architecture as the forward model; it is trained on the reversed
# sequences. The input length is max_length - 1 because the last column of
# every padded sequence is split off as the target.
rev_model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length - 1),
    Bidirectional(LSTM(100)),
    Dense(vocab_size, activation='softmax'),
])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
rev_model.summary()


In [None]:
# Train the forward-direction network and persist it to disk.
# The targets in y are integer word ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# verbose=2 keeps the training log compact
model.fit(X, y, epochs=100, batch_size=100, verbose=2)
# written to the working directory
model.save('model.h5')

In [None]:
# Train the reverse-direction network and persist it to disk.
# The targets in rev_y are integer word ids, hence the sparse categorical loss.
rev_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# verbose=2 keeps the training log compact
rev_model.fit(rev_X, rev_y, epochs=100, batch_size=100, verbose=2)
# written to the working directory
rev_model.save('rev_model.h5')

In [2]:
# generate a sequence using a language model
def generate_seq(model, tokenizer, max_length, seed_text):
    """Predict the most likely next words for ``seed_text``.

    The seed is integer-encoded with ``tokenizer``, pre-padded to
    ``max_length`` and passed to ``model.predict``. The five highest-scoring
    vocabulary indices are mapped back to words, and words found in the
    notebook-level ``stoplist`` are dropped.

    Args:
        model: object whose ``predict`` returns one score per vocabulary index.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        max_length: length the encoded seed is padded/truncated to; it must
            match the input length the model expects.
        seed_text: context text to continue.

    Returns:
        The surviving predictions in descending score order, each preceded by
        a single space (so ``.split()`` yields the word list). Returns ``""``
        for an empty seed.
    """
    if seed_text == "":
        return ""

    n_preds = 5 #number of words to predict for the seed text

    # Invert word -> index once instead of scanning the whole vocabulary for
    # every predicted index.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # Filter against the stop-word list. The name `stopwords` is the NLTK
    # corpus reader object, not a collection of words, so it cannot be used
    # for this membership test.
    excluded = set(stoplist)

    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # pre-pad sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # predict probabilities for each word
    proba = model.predict(encoded, verbose=0).flatten()
    #take the n_preds highest probability classes
    top_indices = numpy.argsort(-proba)[:n_preds]

    pred_words = ""
    for index in top_indices:
        # Index 0 is the padding value and has no word; skip it.
        word = index_to_word.get(int(index))
        if word is not None and word not in excluded:
            pred_words += ' ' + word
    return pred_words

In [3]:
from nltk.corpus import stopwords

# Initialize the stopwords: `stoplist` is the English stop-word list returned
# by the NLTK corpus reader (the reader itself stays bound to `stopwords`).
stoplist = stopwords.words('english')

In [4]:
# Restore the artefacts written by the training cells.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# pickle files that this notebook produced itself.
with open('max_length.pkl', 'rb') as length_file:
    max_length = pickle.load(length_file)

with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Trained reverse and forward language models saved by the training cells.
rev_model = load_model('rev_model.h5')
model = load_model('model.h5')

# spaCy pipeline whose vocabulary supplies (and receives) the word vectors.
import en_core_web_md
nlp = en_core_web_md.load()

In [5]:
#Find and set embeddings for OOV words
def set_embedding_for_oov(doc):
    """Assign a vector to every out-of-vocabulary token in ``doc``.

    For each token flagged ``is_oov``, the text to its left is fed to the
    forward model and the reversed text to its right to the reverse model
    (both through ``generate_seq``). The spaCy vectors of the predicted words
    are summed with a weight that starts at the character length of the
    respective context text and halves after every word. The sum is stored in
    ``nlp.vocab`` under the token's text.

    Uses the notebook-level ``model``, ``rev_model``, ``tokenizer``,
    ``max_length`` and ``nlp``. Prints the contexts, the predicted words and
    the resulting vector.

    Args:
        doc: iterable of spaCy tokens (normally a ``Doc``) to scan.

    Returns:
        None. The result is the side effect on ``nlp.vocab``.
    """
    #checking for oov words and adding embedding
    for token in doc:
        if not token.is_oov:
            continue

        before_text = doc[:token.i].text
        # Tokens to the right of the OOV word, nearest-last, joined by spaces.
        # Built from the token texts directly rather than by stringifying a
        # numpy array, which wraps or elides long input and mangles tokens
        # containing brackets.
        following = list(doc[token.i + 1:])
        after_text = ' '.join(tok.text for tok in reversed(following))
        print("Before Text: ",before_text)
        print("After text: ",after_text )

        pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
        pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()

        # 300 is presumably the vector width of the loaded spaCy pipeline;
        # confirm if a different pipeline is used.
        embedding = numpy.zeros((300,))

        contributions = (
            ('Words predicted from forward sequence model:', pred_before, len(before_text)),
            ('Words predicted from reverse sequence model:', pred_after, len(after_text)),
        )
        for heading, words, start_weight in contributions:
            print(heading)
            weight = start_weight
            for word in words:
                print(word)
                embedding += weight*nlp.vocab.get_vector(word)
                # each later (lower-ranked) prediction counts half as much
                weight = weight*.5

        nlp.vocab.set_vector(token.text, embedding)
        print(token.text,nlp.vocab.get_vector(token.text))

In [6]:
# Demo: run the OOV embedding step on a sentence containing a made-up word
# ('livwgffe'), which the pipeline flags as out-of-vocabulary.
doc = nlp('i livwgffe in london and')
set_embedding_for_oov(doc)

Before Text:  i
After text:  and london in


InternalError: Blas GEMM launch failed : a.shape=(1, 100), b.shape=(100, 100), m=1, n=100, k=100
	 [[Node: bidirectional_1/while_1/MatMul_6 = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](bidirectional_1/while_1/Switch_3:1, bidirectional_1/while_1/MatMul_6/Enter)]]
	 [[Node: dense_1/Softmax/_587 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_472_dense_1/Softmax", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
# Debug probe: integer encoding the fitted tokenizer assigns to the text 'and'.
tokenizer.texts_to_sequences(['and'])[0]

In [18]:
# Debug cell: evaluate the output of every layer of `model` for the single
# word 'and'. The name `test` defined here is reused by a later cell.
from keras import backend as K
inp = model.input                                           # input placeholder

encoded = tokenizer.texts_to_sequences(['and'])[0]
outputs = [layer.output for layer in model.layers]          # all layer outputs
# One backend function per layer; each takes the model input plus the
# learning-phase flag and returns that layer's output.
functors = [K.function([inp, K.learning_phase()], [out]) for out in outputs]    # evaluation functions

# Testing
# NOTE(review): this pads to max_length, whereas the generate_seq callers
# pass max_length-1; confirm which length the loaded model expects.
test =pad_sequences([encoded], maxlen=max_length, padding='pre')
# NOTE(review): the second value is fed to K.learning_phase(); in Keras 1
# conventionally selects training mode and 0 inference; confirm 1 is intended.
layer_outs = [func([test, 1.]) for func in functors]
print (layer_outs)

[[array([[[ 0.03229091,  0.14570677,  0.10186311, ..., -0.3778659 ,
          0.10639041,  0.30900633],
        [ 0.03229091,  0.14570677,  0.10186311, ..., -0.3778659 ,
          0.10639041,  0.30900633],
        [ 0.03229091,  0.14570677,  0.10186311, ..., -0.3778659 ,
          0.10639041,  0.30900633],
        ...,
        [ 0.03229091,  0.14570677,  0.10186311, ..., -0.3778659 ,
          0.10639041,  0.30900633],
        [ 0.03229091,  0.14570677,  0.10186311, ..., -0.3778659 ,
          0.10639041,  0.30900633],
        [-0.1914773 ,  0.13585521,  0.13554917, ...,  0.06510769,
         -0.1267743 ,  0.11291379]]], dtype=float32)], [array([[-0.0000000e+00, -9.8321718e-01, -1.0000000e+00, -7.6140159e-01,
        -9.2983049e-01,  5.4122710e-01, -8.7296522e-01, -9.8710352e-01,
        -4.4785690e-01, -0.0000000e+00,  0.0000000e+00,  9.9977386e-01,
        -9.9959952e-01, -9.9994224e-01, -0.0000000e+00,  0.0000000e+00,
        -0.0000000e+00, -0.0000000e+00, -0.0000000e+00, -7.528636

In [24]:
# Debug cell: evaluate a single layer's output for the padded input `test`
# built in an earlier cell.
from keras import backend as K

# with a Sequential model
# NOTE: despite its name, this function maps the input of layers[0] to the
# output of layers[0], i.e. the first layer, not the third.
get_3rd_layer_output = K.function([model.layers[0].input],
                                  [model.layers[0].output])
# K.function returns a list of outputs; take the only one.
layer_output = get_3rd_layer_output([test])[0]

In [25]:
# Show the first-layer output computed in the previous cell.
print(layer_output)

[[[ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
    0.30900633]
  [ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
    0.30900633]
  [ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
    0.30900633]
  ...
  [ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
    0.30900633]
  [ 0.03229091  0.14570677  0.10186311 ... -0.3778659   0.10639041
    0.30900633]
  [-0.1914773   0.13585521  0.13554917 ...  0.06510769 -0.1267743
    0.11291379]]]


In [21]:
# Shape of the first-layer output computed earlier.
print(layer_output.shape)

(1, 50, 100)


In [29]:
# Print the weights of the first layer only (nothing is printed if the model
# has no layers).
first_layer = next(iter(model.layers), None)
if first_layer is not None:
    weights = first_layer.get_weights() # list of numpy arrays
    print(weights)

[array([[ 0.03229091,  0.14570677,  0.10186311, ..., -0.3778659 ,
         0.10639041,  0.30900633],
       [ 0.4312459 ,  0.8094974 ,  0.5639969 , ..., -0.3482559 ,
         0.66176736,  0.34343928],
       [ 0.0354765 ,  0.14547107, -0.08885228, ..., -0.04287698,
         0.98114103,  1.0777733 ],
       ...,
       [ 0.29039997,  0.4459584 ,  0.07219367, ...,  0.38600206,
        -0.08380733, -0.27997503],
       [ 0.34965372,  0.16146661,  0.3918662 , ...,  0.16594979,
        -0.10325196, -0.11696597],
       [-0.35282457, -0.52569634,  0.10266012, ...,  0.16191858,
         0.11232043, -0.19642301]], dtype=float32)]


In [1]:
# First weight array of the model's first layer.
# Requires `model` to be loaded in the current kernel; run on a fresh kernel
# this cell raises NameError, as the recorded output shows.
we = model.layers[0].get_weights()
print(we[0])

NameError: name 'model' is not defined