In [1]:
#importing libraries
import heapq
import pickle

import numpy
import spacy
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array
from spacy.vocab import Vocab

Using TensorFlow backend.


In [3]:
#reading processed data
# Only the first 100,000 characters are used. read(100000) stops there instead
# of loading the whole file, and the context manager closes the file handle.
with open('cleandata.csv') as f:
    data = f.read(100000)
# Show a short preview rather than echoing the full text into the notebook.
data[:500]



In [7]:
#function for preparing text data into sequences for training 
def data_sequencing(data):
    """Build forward and reversed next-word training sequences from raw text.

    The text is split on '.', each piece is integer-encoded with a Keras
    ``Tokenizer`` fitted on the whole text, and every prefix of length >= 2 of
    the encoded piece (and of its reversal) becomes one training sequence.
    Sequences are left-padded to a common length; the last column is the
    target word and the remaining columns are the model input.

    Side effects:
        Writes the fitted tokenizer to 'tokenizer.pkl' and the maximum
        sequence length to 'max_length.pkl' in the working directory, and
        prints the vocabulary size, sequence count and maximum length.

    Args:
        data: the training text as a single string.

    Returns:
        Tuple ``(X, y, rev_X, rev_y, max_length, vocab_size)`` where ``X``/``y``
        come from the forward sequences and ``rev_X``/``rev_y`` from the
        reversed ones.

    Raises:
        ValueError: if no '.'-delimited piece contains at least two known
            words, so no training sequence can be built.
    """
    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([data])
    with open('tokenizer.pkl', 'wb') as f: # Save the tokeniser by pickling it
        pickle.dump(tokenizer, f)

    # retrieve vocabulary size (+1 because index 0 is reserved for padding)
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary Size: %d' % vocab_size)

    # create line-based sequences: every prefix of at least two tokens
    sequences = []
    rev_sequences = []
    for line in data.split('.'):
        encoded = tokenizer.texts_to_sequences([line])[0]
        rev_encoded = encoded[::-1]
        for end in range(2, len(encoded) + 1):
            sequences.append(encoded[:end])
            rev_sequences.append(rev_encoded[:end])
    print('Total Sequences: %d' % len(sequences))

    if not sequences:
        # max() on an empty list would raise an uninformative ValueError
        raise ValueError(
            'No training sequences could be built: every sentence has fewer than two known words'
        )

    #find max sequence length 
    max_length = max(len(seq) for seq in sequences)
    with open('max_length.pkl', 'wb') as f: # Save max_length by pickling it
        pickle.dump(max_length, f)
    print('Max Sequence Length: %d' % max_length)

    # pad sequences and create the forward sequence; pad_sequences already
    # returns a 2-D numpy array, so it can be sliced directly
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    X, y = sequences[:, :-1], sequences[:, -1]

    #pad sequences and create the reverse sequencing
    rev_sequences = pad_sequences(rev_sequences, maxlen=max_length, padding='pre')
    # split into input and output elements
    rev_X, rev_y = rev_sequences[:, :-1], rev_sequences[:, -1]

    return X,y,rev_X,rev_y,max_length,vocab_size

In [8]:
# Build the forward and reversed training arrays from the loaded text.
# max_length and vocab_size are reused below to size the Embedding and Dense layers.

X,y,rev_X,rev_y,max_length,vocab_size = data_sequencing(data)

Vocabulary Size: 3222
Total Sequences: 17310
Max Sequence Length: 35


In [9]:
# define forward sequence model
# Embedding -> bidirectional LSTM -> softmax over the vocabulary (next-word classifier).
model = Sequential()
# input length is max_length-1 because the last column of each padded sequence is the target
model.add(Embedding(vocab_size,100, input_length=max_length-1))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 34, 100)           322200    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_1 (Dense)              (None, 3222)              647622    
Total params: 1,130,622
Trainable params: 1,130,622
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# define reverse model
# Same architecture as the forward model; it is trained on the reversed sequences.
rev_model = Sequential()
# input length is max_length-1 because the last column of each padded sequence is the target
rev_model.add(Embedding(vocab_size, 100, input_length=max_length-1))
rev_model.add(Bidirectional(LSTM(100)))
rev_model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
rev_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 34, 100)           322200    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_2 (Dense)              (None, 3222)              647622    
Total params: 1,130,622
Trainable params: 1,130,622
Non-trainable params: 0
_________________________________________________________________
None


In [98]:
# Compile the forward network. Targets are integer word indices rather than
# one-hot vectors, hence the sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train on the forward sequences.
# NOTE(review): the recorded output below reads "Epoch 1/50" and starts at a low
# loss, which does not match epochs=200 on a fresh model - presumably a stale
# output from an earlier run; re-run from a fresh kernel to regenerate it.
model.fit(
    X,
    y,
    batch_size=100,
    epochs=200,
    verbose=2,
)
# Persist the trained network so it can be reloaded later.
model.save('model.h5')

Epoch 1/50
 - 47s - loss: 0.4698 - acc: 0.8634
Epoch 2/50
 - 45s - loss: 0.4530 - acc: 0.8641
Epoch 3/50
 - 43s - loss: 0.4526 - acc: 0.8616
Epoch 4/50
 - 43s - loss: 0.4480 - acc: 0.8631
Epoch 5/50
 - 42s - loss: 0.4513 - acc: 0.8615
Epoch 6/50
 - 42s - loss: 0.4483 - acc: 0.8643
Epoch 7/50
 - 56s - loss: 0.4473 - acc: 0.8632
Epoch 8/50
 - 51s - loss: 0.4473 - acc: 0.8645
Epoch 9/50
 - 50s - loss: 0.4468 - acc: 0.8645
Epoch 10/50
 - 49s - loss: 0.4470 - acc: 0.8618
Epoch 11/50
 - 49s - loss: 0.4470 - acc: 0.8629
Epoch 12/50
 - 49s - loss: 0.4474 - acc: 0.8630
Epoch 13/50
 - 46s - loss: 0.4459 - acc: 0.8623
Epoch 14/50
 - 43s - loss: 0.4473 - acc: 0.8633
Epoch 15/50
 - 43s - loss: 0.4476 - acc: 0.8637
Epoch 16/50
 - 55s - loss: 0.4483 - acc: 0.8621
Epoch 17/50
 - 63s - loss: 0.4511 - acc: 0.8633
Epoch 18/50
 - 50s - loss: 0.4476 - acc: 0.8624
Epoch 19/50
 - 48s - loss: 0.4435 - acc: 0.8646
Epoch 20/50
 - 43s - loss: 0.4423 - acc: 0.8638
Epoch 21/50
 - 43s - loss: 0.4439 - acc: 0.8642
E

In [97]:
# Compile the reverse network. Targets are integer word indices rather than
# one-hot vectors, hence the sparse variant of categorical cross-entropy.
rev_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train on the reversed sequences.
rev_model.fit(
    rev_X,
    rev_y,
    batch_size=100,
    epochs=200,
    verbose=2,
)
# Persist the trained network so it can be reloaded later.
rev_model.save('rev_model.h5')

Epoch 1/200
 - 42s - loss: 6.4937 - acc: 0.0552
Epoch 2/200
 - 39s - loss: 5.9839 - acc: 0.0596
Epoch 3/200
 - 39s - loss: 5.8262 - acc: 0.0685
Epoch 4/200
 - 39s - loss: 5.6473 - acc: 0.0919
Epoch 5/200
 - 39s - loss: 5.4272 - acc: 0.1232
Epoch 6/200
 - 39s - loss: 5.2240 - acc: 0.1507
Epoch 7/200
 - 39s - loss: 5.0514 - acc: 0.1676
Epoch 8/200
 - 39s - loss: 4.9001 - acc: 0.1843
Epoch 9/200
 - 39s - loss: 4.7644 - acc: 0.1973
Epoch 10/200
 - 39s - loss: 4.6406 - acc: 0.2081
Epoch 11/200
 - 39s - loss: 4.5254 - acc: 0.2205
Epoch 12/200
 - 39s - loss: 4.4156 - acc: 0.2282
Epoch 13/200
 - 39s - loss: 4.3118 - acc: 0.2380
Epoch 14/200
 - 39s - loss: 4.2127 - acc: 0.2469
Epoch 15/200
 - 39s - loss: 4.1147 - acc: 0.2563
Epoch 16/200
 - 39s - loss: 4.0250 - acc: 0.2616
Epoch 17/200
 - 39s - loss: 3.9346 - acc: 0.2686
Epoch 18/200
 - 39s - loss: 3.8434 - acc: 0.2779
Epoch 19/200
 - 39s - loss: 3.7586 - acc: 0.2845
Epoch 20/200
 - 39s - loss: 3.6728 - acc: 0.2901
Epoch 21/200
 - 39s - loss: 3

In [8]:
# generate a sequence using a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_preds=5):
    """Predict the most likely next words for ``seed_text``.

    The seed is integer-encoded with ``tokenizer``, left-padded (or truncated)
    to ``max_length`` and passed to ``model``. The ``n_preds`` classes with the
    highest predicted probability are mapped back to words; classes with no
    word in the tokenizer index (e.g. the padding index 0) and words found in
    the module-level ``stopwords`` collection are dropped.

    Args:
        model: object whose ``predict`` returns class probabilities.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        max_length: input length the seed is padded to.
        seed_text: context text; an empty or ``None`` seed yields ``""``.
        n_preds: number of top-probability classes to consider (default 5).

    Returns:
        The kept words in descending probability order, each preceded by a
        single space (so ``.split()`` gives a list), or ``""`` if none are kept.
    """
    if not seed_text:
        return ""

    # encode the text as integers and pre-pad to the model's input length
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # predict probabilities for each word
    proba = model.predict(encoded, verbose=0).flatten()
    # take the n_preds highest probability classes
    top_indices = numpy.argsort(-proba)[:n_preds]

    # Build the index -> word map once instead of scanning word_index per prediction.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    # `stopwords` is a notebook-level global; a set gives O(1) membership tests.
    excluded = set(stopwords)

    pred_words = ""
    for class_index in top_indices:
        word = index_to_word.get(int(class_index))
        if word is not None and word not in excluded:
            pred_words += ' ' + word
    return pred_words

In [9]:
# load the model
model = load_model('model.h5')
rev_model = load_model('rev_model.h5')

#load tokeniser and max_length
# NOTE: pickle.load can execute arbitrary code - only load files that were
# written by this notebook's data_sequencing step.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open('max_length.pkl', 'rb') as f:
    max_length = pickle.load(f)

#loading stopwords to improve relevant word predictions
# (context manager so the file handle is closed after reading)
with open('stopwords') as f:
    stopwords = f.read().split()

#load the spaCy model that provides the word vectors
nlp = spacy.load('en_core_web_md')

In [10]:
#Find and set embeddings for OOV words
def set_embedding_for_oov(doc):
    """Assign a context-derived vector to every out-of-vocabulary token in ``doc``.

    For each token flagged ``is_oov``, the forward model predicts words from
    the text before the token and the reverse model predicts words from the
    following tokens taken in reverse order. The new vector is a weighted sum
    of the predicted words' vectors: the first prediction of each side is
    weighted by the character length of that side's context text and every
    further prediction by half the previous weight. The result is stored in
    ``nlp.vocab`` under the token's text and printed.

    Relies on the notebook-level globals ``nlp``, ``model``, ``rev_model``,
    ``tokenizer``, ``max_length`` and ``generate_seq``.

    NOTE(review): if neither model predicts a word, an all-zero vector is stored.

    Args:
        doc: iterable of tokens (a spaCy ``Doc``) to scan for OOV tokens.

    Returns:
        None. ``nlp.vocab`` is updated in place.
    """
    #checking for oov words and adding embedding
    for token in doc:
        if not token.is_oov:
            continue

        before_text = doc[:token.i].text
        # Tokens after the OOV token, last one first, joined by single spaces.
        # Built directly from token texts rather than from numpy's string form
        # of an array, which wraps and elides long inputs.
        following_tokens = list(doc[token.i + 1:])
        after_text = ' '.join(t.text for t in reversed(following_tokens))

        pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
        pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()

        # Size the accumulator from the loaded vectors instead of hard-coding 300.
        embedding = numpy.zeros((nlp.vocab.vectors_length,))

        contexts = (
            ('Words predicted from forward sequence model:', before_text, pred_before),
            ('Words predicted from reverse sequence model:', after_text, pred_after),
        )
        for header, context_text, predicted_words in contexts:
            weight = len(context_text)
            print(header)
            for word in predicted_words:
                print(word)
                embedding += weight * nlp.vocab.get_vector(word)
                weight = weight * .5

        nlp.vocab.set_vector(token.text, embedding)
        print(token.text,nlp.vocab.get_vector(token.text))

In [11]:
# Demo: 'livwgffe' is a made-up word; the output below shows it being treated as
# OOV and given a vector built from the forward/reverse model predictions.
doc = nlp('i livwgffe in london ')
set_embedding_for_oov(doc)

Words predicted from forward sequence model:
went
got
took
Words predicted from reverse sequence model:
live
friends
lives
livwgffe [ 6.57660067e-01  4.76244354e+00 -3.03201938e+00  1.08945918e+00
  2.07294154e+00 -1.72011936e+00 -1.04132855e+00 -4.27341700e+00
  5.12522519e-01  4.75169754e+01 -6.46451092e+00  3.23314905e+00
 -8.20549011e-01 -1.24127305e+00 -2.47136021e+00  6.86255598e+00
 -5.67475557e+00  8.91901302e+00 -5.89139748e+00  1.90541768e+00
 -1.44266951e+00 -2.09927797e+00  3.90016246e+00  3.21577621e+00
  2.87224793e+00  1.38080549e+00  5.58334626e-02  3.88464117e+00
  4.88342571e+00  6.54785514e-01  2.08524793e-01  2.35765755e-01
 -4.38081503e+00  8.63592744e-01 -4.01707745e+00  4.26965284e+00
  9.39078152e-01 -8.16541553e-01 -2.88079095e+00 -3.58328491e-01
 -1.37374198e+00  2.54968500e+00  1.22174752e+00  3.81368446e+00
 -1.36955392e+00  6.57276392e+00  2.76710510e-01 -5.57884979e+00
  2.41122460e+00 -1.17881525e+00 -4.76333523e+00 -2.44222641e+00
 -5.06995010e+00 -2.487

In [13]:
# NOTE: most_similar is defined further down, under "# Analysis"; on a fresh
# kernel that cell must be run first or this call raises NameError.
most_similar(nlp('livwgffe'))

['livwgffe',
 'LiVe',
 'LIVE',
 'LIve',
 'live',
 'Live',
 'LivE',
 'LiVE',
 'friends',
 'freinds']

# Analysis

In [12]:
#function to find most similar words
def most_similar(word, topn=10):
    """Return the texts of the ``topn`` vocabulary entries most similar to ``word``.

    Every entry of ``word.vocab`` is scored with ``word.similarity(entry)`` and
    the highest-scoring entries are returned in descending order of similarity.

    Args:
        word: object exposing ``vocab`` (an iterable of entries that have an
            ``orth_`` attribute) and ``similarity(entry)``, e.g. the result of
            ``nlp('some word')``.
        topn: maximum number of entries to return (default 10).

    Returns:
        List of ``orth_`` strings, best match first; shorter than ``topn`` when
        the vocabulary has fewer entries.
    """
    # nlargest yields the same ordering as sorted(..., reverse=True)[:topn]
    # without fully sorting the vocabulary.
    closest = heapq.nlargest(topn, word.vocab, key=word.similarity)
    return [entry.orth_ for entry in closest]

## Test 1

In [6]:
# Baseline: vector stored for 'lndn' before an embedding is assigned (all zeros in the output below).
nlp.vocab.get_vector('lndn')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [44]:
# Derive an embedding for the OOV token 'lndn' from its sentence context,
# then read the stored vector back to confirm it is no longer all zeros.
test1 = nlp('i live in lndn ')
set_embedding_for_oov(test1)
nlp.vocab.get_vector('lndn')

Words predicted from forward sequence model:
london
china
dresden
paris
Words predicted from reverse sequence model:


array([ 1.8482960e+00, -1.3151566e+00,  5.1632800e+00, -2.5023196e+00,
        1.1026979e+01, -1.1978415e+00,  2.6360888e+00, -7.6715689e+00,
        1.8184205e+00,  1.2371289e+01, -1.8969524e+01, -6.9695215e+00,
       -4.0036454e+00, -3.4339972e+00,  2.2859437e+00,  2.0211749e+00,
       -1.6033973e+00,  2.5036875e+01,  7.0728431e+00,  1.6018537e+00,
        7.1240129e+00,  4.5292950e+00, -3.7383690e-02, -7.1894655e+00,
       -1.3374448e+00, -9.9555075e-01, -3.1635969e+00,  6.5285888e+00,
        3.5050064e-01,  6.0480785e+00,  1.6345665e+00,  4.3460326e+00,
        3.8556526e+00,  1.1056219e+01, -5.3953868e-01,  1.8110361e+00,
        7.4895191e-01, -3.0155444e+00, -2.2909701e+00,  1.6532058e+00,
        1.5836163e+00, -3.9497399e+00, -9.4094789e-01,  5.8741717e+00,
       -3.9443329e-01, -3.0442669e+00,  4.3613777e+00, -7.3590183e+00,
        3.3658335e+00,  2.9368665e+00, -7.5039077e+00, -3.0951777e+00,
       -2.7213774e+00, -6.5578384e+00, -5.4292530e-02, -2.7713127e+00,
      

In [11]:
# Similarity score between the newly assigned 'lndn' vector and 'London'.
nlp('lndn').similarity(nlp('London'))

0.9351419538514082

In [12]:
# Ten vocabulary entries ranked closest to 'lndn' after the update.
most_similar(nlp('lndn'))

['lndn',
 'marylebone',
 'brighton',
 'BRIGHTON',
 'Covent',
 'holborn',
 'HEATHROW',
 'Clapham',
 'Kensington',
 'HAMMERSMITH']

## Test 2

In [29]:
# Second example sentence; 'fidditch' is the made-up token whose embedding is inferred below.
test2 = nlp('i play fidditch at school')

In [24]:
# Baseline: vector stored for 'fidditch' before an embedding is assigned (all zeros in the output below).
nlp.vocab.get_vector('fidditch')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [45]:
# Infer and store an embedding for each OOV token in test2; the predicted context words are printed.
set_embedding_for_oov(test2)

Words predicted from forward sequence model:
video
basketball
two
months
going
Words predicted from reverse sequence model:
sport
hockey
attendance
race
continued


In [31]:
# Read back the vector now stored for 'fidditch'.
nlp.vocab.get_vector('fidditch')

array([ 6.95829821e+00,  2.58323145e+00,  4.13022232e+00, -2.82207894e+00,
        1.22050238e+01, -1.76323080e+00,  7.34122372e+00, -2.76011181e+00,
       -1.99373531e+00,  5.98183708e+01, -9.07633114e+00, -4.26140964e-01,
        8.36639786e+00,  6.84363127e+00,  5.71905375e+00,  1.00508451e+00,
        1.64460516e+00,  1.97753239e+01, -4.80817366e+00, -3.64908814e-01,
       -4.94153309e+00, -1.01224051e+01,  1.24436235e+01, -3.83875942e+00,
        6.56430244e-01,  1.11855803e+01, -6.92403507e+00,  2.84960055e+00,
       -3.52828813e+00,  2.85260391e+00,  1.64034700e+00, -1.75572252e+00,
       -1.02143236e-01, -4.52884167e-01,  9.89408135e-01, -3.32059598e+00,
        6.54138470e+00, -8.01946259e+00, -4.17164993e+00,  3.92733335e+00,
       -3.85233045e+00,  7.31532288e+00,  4.90277863e+00,  6.72401762e+00,
       -4.65730286e+00,  1.19491875e+00, -4.31131363e+00,  1.44764194e+01,
        3.01500940e+00,  1.13407791e+00, -3.74219060e+00, -2.01545906e+00,
        8.59033298e+00, -

In [32]:
# Ten vocabulary entries ranked closest to 'fidditch' after the update.
most_similar(nlp('fidditch'))

['SPORT',
 'sportive',
 'Sportsman',
 'SPORTSMAN',
 'sportsman',
 'Sport',
 'sport',
 'SPORTS',
 'sports',
 'Sports']

In [46]:
# Similarity score between the newly assigned 'fidditch' vector and 'sport'.
nlp('fidditch').similarity(nlp('sport'))

0.845331215478122