In [1]:
import random
import collections

import numpy as np
from scipy import spatial

import tensorflow as tf
tf.__version__

'1.5.0'

In [2]:
#pretrained Glove embeddings path
glove_path = '/Users/srikanth_m07/Documents/ml_dataset/nlp/wordVectors/glove.6B/glove.6B.50d.txt'
glove_vocab = []
glove_embd=[]
embedding_dict = {}

In [3]:
file = open(glove_path,'r',encoding='UTF-8')
for line in file.readlines():
    row = line.strip().split(' ')
    vocab_word = row[0]
    glove_vocab.append(vocab_word)
    embed_vector = [float(i) for i in row[1:]] # convert to list of float
    embedding_dict[vocab_word]=embed_vector
file.close()
  
print('Loaded GLOVE')
 
glove_vocab_size = len(glove_vocab)
embedding_dim = len(embed_vector)
embedding_dim

Loaded GLOVE


50

In [4]:
embedding_dim

50

Here we define a block of text to use as our training data.  We load the data into a numpy array for easy indexing.

In [5]:
fable_text = """
long ago , the mice had a general council to consider what measures
they could take to outwit their common enemy , the cat . some said
this , and some said that but at last a young mouse got up and said
he had a proposal to make , which he thought would meet the case . 
you will all agree , said he , that our chief danger consists in the
sly and treacherous manner in which the enemy approaches us . now , 
if we could receive some signal of her approach , we could easily
escape from her . i venture , therefore , to propose that a small
bell be procured , and attached by a ribbon round the neck of the cat
. by this means we should always know when she was about , and could
easily retire while she was in the neighbourhood . this proposal met
with general applause , until an old mouse got up and said that is
all very well , but who is to bell the cat ? the mice looked at one
another and nobody spoke . then the old mouse said it is easy to
propose impossible remedies .
"""

In [6]:
fable_text = fable_text.replace('\n', '')

In [7]:
fable_text

'long ago , the mice had a general council to consider what measuresthey could take to outwit their common enemy , the cat . some saidthis , and some said that but at last a young mouse got up and saidhe had a proposal to make , which he thought would meet the case . you will all agree , said he , that our chief danger consists in thesly and treacherous manner in which the enemy approaches us . now , if we could receive some signal of her approach , we could easilyescape from her . i venture , therefore , to propose that a smallbell be procured , and attached by a ribbon round the neck of the cat. by this means we should always know when she was about , and couldeasily retire while she was in the neighbourhood . this proposal metwith general applause , until an old mouse got up and said that isall very well , but who is to bell the cat ? the mice looked at oneanother and nobody spoke . then the old mouse said it is easy topropose impossible remedies .'

In [8]:
#this function puts all the words in a single column vector within a numpy array
def read_data(raw_text):
    content = raw_text
    content = content.split() #splits the text by spaces (default split character)
    content = np.array(content) #to array
    content = np.reshape(content, [-1, ])
    return content
 
training_data = read_data(fable_text)

In [9]:
training_data.shape, training_data

((192,),
 array(['long', 'ago', ',', 'the', 'mice', 'had', 'a', 'general', 'council',
        'to', 'consider', 'what', 'measuresthey', 'could', 'take', 'to',
        'outwit', 'their', 'common', 'enemy', ',', 'the', 'cat', '.',
        'some', 'saidthis', ',', 'and', 'some', 'said', 'that', 'but', 'at',
        'last', 'a', 'young', 'mouse', 'got', 'up', 'and', 'saidhe', 'had',
        'a', 'proposal', 'to', 'make', ',', 'which', 'he', 'thought',
        'would', 'meet', 'the', 'case', '.', 'you', 'will', 'all', 'agree',
        ',', 'said', 'he', ',', 'that', 'our', 'chief', 'danger',
        'consists', 'in', 'thesly', 'and', 'treacherous', 'manner', 'in',
        'which', 'the', 'enemy', 'approaches', 'us', '.', 'now', ',', 'if',
        'we', 'could', 'receive', 'some', 'signal', 'of', 'her', 'approach',
        ',', 'we', 'could', 'easilyescape', 'from', 'her', '.', 'i',
        'venture', ',', 'therefore', ',', 'to', 'propose', 'that', 'a',
        'smallbell', 'be', 'procured',

Next we build a dictionary and reverse dictionary that maps each word in the document vocabulary to a unique integer value.  These dictionaries serve a slightly different purpose than the dictionary with our word embeddings, but the two come together as we’ll see in  a moment.

In [10]:
#Create dictionary and reverse dictionary with word ids
def build_dictionaries(words):
    count = collections.Counter(words).most_common() #creates list of word/count pairs;
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) #len(dictionary) increases each iteration
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary
 
word2id, id2word = build_dictionaries(training_data)

Now we can finally create the array of word embeddings that we’ll actually be loading into Tensorflow.  What’s critical to realize at this point is that we probably have lots of pre-trained words in embedding_dict that aren’t in our training data and conversely we may have words in our training data that aren’t included in our pre-trained set.

We solve this by looping through all the words in dictionary.  If the word is one that we already have an embedding for, then we add the embedding to a new object embeddings_tmp.  If the word is not one that we already have an embedding for, then we assign a vector of small random values.  (Don’t worry, when we train our model, we’ll allow Tensorflow to update these randomly assigned values.)

In [23]:
#Create embedding array
 
doc_vocab_size = len(word2id)
dict_as_list = sorted(word2id.items(), key = lambda x : x[1])
 
embeddings_matrix=[]
 
for i in range(doc_vocab_size):
    item = dict_as_list[i][0]
    if item in glove_vocab:
        embeddings_matrix.append(embedding_dict[item])
    else:
        rand_num = np.random.uniform(low=-0.2, high=0.2,size=embedding_dim)
        embeddings_matrix.append(rand_num)

#final embedding array corresponds to dictionary of words in the document
embedding = np.asarray(embeddings_matrix)
 
# create tree so that we can later search for closest vector to prediction
tree = spatial.KDTree(embedding)

Next we set up or RNN model, so this is a perfect time to start talking about tensor shapes.

The most critical step in the code below is where we feed our object x_unstack into the RNN.  So if we can understand what tensor shape we need at that moment and work our way backward, we should have a good understanding of how to shape our inputs.

So let’s look at the shapes of the objects we’ll be using in our code below:

x: (?,3)<br>
embedding_placeholder: (112, 300)<br>
embedded_chars:  (?, 3, 300)<br>
x_unstack: three separate sensors of shape (?,300)<br>
outputs:  (?, 512)<br>

Our input x is a matrix with an undetermined number rows and is three columns wide.

The other input embedding_placeholder has one row per word in our document vocabulary and is 300 columns wide (to match the dimension of our word embeddings).  We initialize matrix W with the values from embedding_placeholder.

Then we use the function tf.nn.embedding_lookup() to look up each of our inputs from x in matrix W resulting in the three-dimensional tensor embedded_chars that has shape (?, 3, 300).  We then unstack unto individual matrices of dimension (?,300) to feed into our RNN.

In [13]:
# model parameters
learning_rate = 0.001
n_input = 3 # this is the number of words that are read at a time
n_hidden = 512

In [19]:
tf.reset_default_graph()
# create input placeholders
x = tf.placeholder(tf.int32, [None, n_input])
y = tf.placeholder(tf.float32, [None, embedding_dim])
embedding_placeholder = tf.placeholder(tf.float32, [doc_vocab_size, embedding_dim])

#RNN output node weights and biases
weights = { 'out': tf.Variable(tf.random_normal([n_hidden, embedding_dim])) }
biases = { 'out': tf.Variable(tf.random_normal([embedding_dim])) }

In [20]:
with tf.name_scope("embedding"):
    W = tf.Variable(tf.constant(0.0, shape=[doc_vocab_size, embedding_dim]), trainable=True, name="W")
    embedding_init = W.assign(embedding_placeholder)
    embedded_chars = tf.nn.embedding_lookup(W,x)

# reshape input data
x_unstack = tf.unstack(embedded_chars, n_input, 1)
 
# create RNN cells
rnn_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(n_hidden),tf.contrib.rnn.BasicLSTMCell(n_hidden)])
outputs, states = tf.contrib.rnn.static_rnn(rnn_cell, x_unstack, dtype=tf.float32)
 
# capture only the last output
pred = tf.matmul(outputs[-1], weights['out']) + biases['out'] 
 
# Create loss function and optimizer
cost = tf.reduce_mean(tf.nn.l2_loss(pred-y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [21]:
# Initialize
init=tf.global_variables_initializer()
 
# Launch the graph
sess = tf.Session()
sess.run(init)
sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})
 
step=0
offset = random.randint(0,n_input+1) #random integer between 0 and 3
end_offset = n_input+1 # in our case tihs is 4
acc_total = 0
loss_total = 0
training_iters = 10000
display_step = 500

In [24]:
while step < training_iters:
    
    if offset > (len(training_data) - end_offset):
        offset = random.randint(0, n_input+1)
  # get the integer representations for the input words
    x_integers = [[word2id[str(training_data[i])]] for i in range(offset, offset+n_input)]
    x_integers = np.reshape(np.array(x_integers), [-1, n_input])
  
 # create embedding for target vector 
  
    y_position = offset+n_input
    y_integer = word2id[training_data[y_position]]
    y_embedding = embedding[y_integer,:]
    y_embedding = np.reshape(y_embedding,[1,-1])

  
    _,loss, pred_ = sess.run([optimizer, cost,pred], feed_dict = {x: x_integers, y: y_embedding})
    loss_total += loss
 
 # display output to show progress
  
    if (step+1) % display_step ==0:
        words_in = [str(training_data[i]) for i in range(offset, offset+n_input)] 
        target_word = str(training_data[y_position])
        
        nearest_dist,nearest_idx = tree.query(pred_[0],3)
        nearest_words = [id2word[idx] for idx in nearest_idx]
  
        print("%s - [%s] vs [%s]" % (words_in, target_word, nearest_words))
        print("Average Loss= " + "{:.6f}".format(loss_total/display_step))
    
    loss_total=0
    step +=1
    offset += (n_input+1) 

print ("Finished Optimization")

['by', 'a', 'ribbon'] - [round] vs [['but', '.', 'this']]
Average Loss= 0.040058
['last', 'a', 'young'] - [mouse] vs [['mouse', 'saidhe', 'metwith']]
Average Loss= 0.006195
[',', 'until', 'an'] - [old] vs [['old', 'who', 'young']]
Average Loss= 0.002828
['approaches', 'us', '.'] - [now] vs [['now', 'but', '.']]
Average Loss= 0.002552
['topropose', 'impossible', 'remedies'] - [.] vs [['.', ',', 'and']]
Average Loss= 0.000663
['procured', ',', 'and'] - [attached] vs [['attached', 'saidhe', 'oneanother']]
Average Loss= 0.002070
['to', 'make', ','] - [which] vs [['.', 'but', 'it']]
Average Loss= 0.003199
['said', 'that', 'isall'] - [very] vs [['very', 'but', 'this']]
Average Loss= 0.001956
['which', 'the', 'enemy'] - [approaches] vs [['approaches', 'approach', 'saidthis']]
Average Loss= 0.001970
['said', 'it', 'is'] - [easy] vs [['easy', 'make', 'always']]
Average Loss= 0.000688
[',', 'and', 'attached'] - [by] vs [['by', 'while', '.']]
Average Loss= 0.002442
['but', 'at', 'last'] - [a] vs 

In [None]:
y_position = offset+n_input
y_integer = word2id[training_data[y_position]]
y_embedding = embedding[y_integer,:]
y_embedding = np.reshape(y_embedding,[1,-1])

In [None]:
x_integers