### Word2Vec using Pretrain model

In [1]:
import os
import re
import io
import requests
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile
from tensorflow.python.framework import ops
import collections
from tqdm import tqdm

In [9]:
batch_size=32
vocabulary_size=7
embedding_size=300

In [None]:
# Create a text cleaning function
def clean_text(text_string):
    text_string = re.sub(r'([^\s\w]|_|[0-9])+', '', text_string)
    text_string = " ".join(text_string.split())
    text_string = text_string.lower()
    return(text_string)

In [5]:
def tokenizer(text):
    text = [document.lower().replace('\n', '').split() for document in text]
    return text

In [6]:
# sentences = ' '.join(text_data_train)
# words = sentences.split()
words=['man','king','mango','roads','dinner','food','morning']
word = tokenizer(words)
print('Data size', len(words))
 

# get unique words and map to glove set
print('Unique word count', len(set(words))) 

Data size 7
Unique word count 7


In [12]:
word

[['man'], ['king'], ['mango'], ['roads'], ['dinner'], ['food'], ['morning']]

In [50]:
# drop rare words
vocabulary_size = 8


def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)

In [56]:
data

[4, 3, 7, 6, 2, 1, 5]

In [52]:
count

[['UNK', 0],
 ('food', 1),
 ('dinner', 1),
 ('king', 1),
 ('man', 1),
 ('morning', 1),
 ('roads', 1),
 ('mango', 1)]

In [53]:
dictionary

{'UNK': 0,
 'dinner': 2,
 'food': 1,
 'king': 3,
 'man': 4,
 'mango': 7,
 'morning': 5,
 'roads': 6}

In [54]:
reverse_dictionary

{0: 'UNK',
 1: 'food',
 2: 'dinner',
 3: 'king',
 4: 'man',
 5: 'morning',
 6: 'roads',
 7: 'mango'}

### Load pre-trained weights

In [3]:
GLOVE_DIR=""
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.42B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1917495 word vectors.


In [108]:
# prepare embedding matrix
num_words = vocabulary_size
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in dictionary.items():
    if i >= vocabulary_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        print("{} Words:{} Vectors:{}".format(i,word,embedding_vector))

1 Words:food Vectors:[ -1.15189999e-01   2.26339996e-01  -1.63339991e-02  -1.36480004e-01
   5.18559992e-01  -3.30330014e-01  -3.10200000e+00  -2.93610007e-01
   1.95720002e-01  -2.15800002e-01   1.75170004e-01  -4.19840008e-01
  -1.48739994e-01  -5.91189981e-01   3.92100006e-01   9.40909982e-02
  -2.22489998e-01  -2.79760007e-02   2.22790003e-01  -1.63709998e-01
  -5.52709997e-01  -1.16250001e-01  -2.60520011e-01  -3.65429997e-01
  -8.86009991e-01  -5.44239998e-01  -5.66289984e-02  -7.04500020e-01
  -1.28800005e-01  -4.55449998e-01   1.30380005e-01  -2.63509989e-01
   3.22090000e-01   2.68790007e-01   2.33339995e-01   4.28149998e-01
  -1.40630007e-01  -3.97619992e-01   1.19460002e-01  -5.22119999e-02
   1.75070003e-01  -1.98190004e-01   3.14420015e-01   8.71360004e-02
  -3.34450006e-02   6.65870011e-01   2.73779988e-01   1.28769994e-01
  -4.43940014e-01  -4.14110005e-01  -1.74630001e-01   1.34800002e-01
  -1.83589995e-01   5.66169977e-01   4.91360009e-01  -1.53190002e-01
  -5.39399981

In [109]:
embedding_matrix.shape

(8, 300)

In [91]:
i=1
vocab=[]
for x in count[1:]:
    vocab.append(count[i][0])
    i=i+1

In [92]:
vocab

['food', 'dinner', 'king', 'man', 'morning', 'roads', 'mango']

In [93]:
# Change texts into numeric vectors
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(1)
#fit the vocab from glove
pretrain = vocab_processor.fit(vocab)
#transform inputs
text_processed = np.array(list(vocab_processor.transform(words)))

In [94]:
text_processed

array([[4],
       [3],
       [7],
       [6],
       [2],
       [1],
       [5]])

In [95]:
words

['man', 'king', 'mango', 'roads', 'dinner', 'food', 'morning']

In [60]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(nb_words=7)
tokenizer.fit_on_texts(words)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7 unique tokens.


Using TensorFlow backend.


In [63]:
word_index

{'dinner': 2,
 'food': 1,
 'king': 3,
 'man': 4,
 'mango': 7,
 'morning': 5,
 'roads': 6}

In [111]:
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):

    # Input data.
    test_dataset = tf.constant(data, dtype=tf.int32)
    # Variables.
    embeddings = tf.Variable(
        tf.random_uniform([num_words, embedding_size], -1.0, 1.0))
    embedding_placeholder = tf.placeholder(tf.float32, [num_words, embedding_size])
    embedding_init = embeddings.assign(embedding_placeholder)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, test_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

In [120]:
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    sess.run(embedding_init, feed_dict={embedding_placeholder: embedding_matrix})
    
    print('Initialized')
    sim = similarity.eval()
    for i in range(6):
        valid_word = dictionary[words[i]]
        top_k = 1 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % reverse_dictionary.get(valid_word)
        for k in range(top_k):
            close_word = reverse_dictionary[nearest[k]]
            log = '%s %s,' % (log, close_word)
        print(log)

Initialized
Nearest to man: king,
Nearest to king: man,
Nearest to mango: food,
Nearest to roads: morning,
Nearest to dinner: food,
Nearest to food: dinner,
