In [1]:
import itertools
import logging

import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe
from keras.preprocessing.text import Tokenizer

logger = logging.getLogger('word2vec')

Using TensorFlow backend.


In [2]:
tf.enable_eager_execution()

In [3]:
def batch(iterable, n=1):
    '''Utility function for splitting an iterable into an iterable of smaller batches'''
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

def pad_sentence(sentence, window_size=1):
    START_TOKEN = '<s> '
    END_TOKEN = ' </s>'
    padded_sentence = (START_TOKEN * window_size) + sentence + (END_TOKEN * window_size)
    return padded_sentence

def cbow_preprocess(sequence, window_size=1):
    '''
    Turns 'the quick brown fox jumped over the lazy dog' into
    [[('the', 'brown'), 'quick'],
     [('quick', 'fox'), 'brown'],
     [('brown', 'jumped'), 'fox'],
     ...
     
     except as NumPy arrays.
    '''
    inputs = []
    labels = []

    for i, token in enumerate(sequence):
        if ((i < window_size) | (i > len(sequence) - (window_size + 1))):
            pass
        else:
            previous_words = tuple(sequence[i-window_size:i])
            next_words = tuple(sequence[i+1:i+window_size+1])
            context = previous_words + next_words
            label = token
            inputs.append(context)
            labels.append(label)

    inputs = np.squeeze(np.array(inputs))
    labels = np.expand_dims(np.array(labels), 1)
    
    return inputs, labels

def skipgram_preprocess(sequence, window_size=1):
    '''
    Turns 'the quick brown fox jumped over the lazy dog' into
    [('quick', 'the'),
     ('quick', 'brown'),
     ('brown', 'quick'),
     ('brown', 'fox'),
     ...
     
     except as NumPy arrays.
    '''
    inputs = []
    labels = []

    for i, token in enumerate(sequence):
        if ((i < window_size) | (i > len(sequence) - (window_size + 1))):
            pass
        else:        
            for input_ in sequence[i-window_size:i]:
                inputs.append(input_)
                labels.append(token)
            for input_ in sequence[i+1:i+window_size+1]:
                inputs.append(input_)
                labels.append(token)
    
    inputs = np.squeeze(np.array(inputs))
    labels = np.expand_dims(np.array(labels), 1)
    
    return inputs, labels

In [4]:
class Word2Vec:
    '''
    ToDo: Write the docs.
    ToDo: Add logging statements.
    '''

    def __init__(self, texts, embedding_dim=100, window_size=2, architecture='skipgram',
                 batch_size=32, num_epochs=1, num_sampled=200):

        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.num_sampled = num_sampled
        self.padded_texts = [pad_sentence(text, window_size) for text in texts]

        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.padded_texts)
        self.tokenized_texts = self.tokenizer.texts_to_sequences(self.padded_texts)
        self.vocab_size = len(self.tokenizer.word_index)

        self._initialize_tensors()

        self.embedding_matrix_numpy = None
        self.embedding_dict = None

        if architecture == 'cbow':
            self.train_op = self._get_skipgram_train_op()
            self.preprocessor = cbow_preprocess
        else:
            self.train_op = self._get_cbow_train_op()
            self.preprocessor = skipgram_preprocess

    def _initialize_tensors(self):
        shape = (self.vocab_size, self.embedding_dim)
        self.embedding_matrix_ = tf.get_variable(name='embedding_matrix', initializer=tf.random_uniform(shape, -1.0, 1.0))
        
        nce_w_init = tf.truncated_normal(shape, stddev=1.0 / (self.vocab_size ** 0.5))
        self.nce_weights_ = tf.get_variable(name='nce_weights', initializer=nce_w_init)
        self.nce_biases_ = tf.get_variable(name='nce_biases', initializer=tf.zeros([self.vocab_size]))
        
        self.train_inputs_ = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.train_labels_ = tf.placeholder(tf.int32, shape=[self.batch_size, 1])

    def _concat_train_data(self, preprocessed_texts):
        if self.preprocessor == skipgram_preprocess:
            contexts = np.hstack(t[0] for t in preprocessed_texts)
        else:
            contexts = np.vstack(t[0] for t in preprocessed_texts)
        labels = np.vstack(t[1] for t in preprocessed_texts)

        return contexts, labels

    def _get_skipgram_train_op(self):
        '''Graph for the skipgram Word2Vec model.'''

        embeddings_ = tf.nn.embedding_lookup(self.embedding_matrix_, self.train_inputs_)
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=self.nce_weights_,
                biases=self.nce_biases_,
                labels=self.train_labels_,
                inputs=embeddings_,
                num_sampled=self.num_sampled,
                num_classes=self.vocab_size
            )
        )
        train_op = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

        return train_op

    def _get_cbow_train_op(self):
        '''Graph for the cbow Word2Vec model.'''

        embeddings_ = tf.nn.embedding_lookup(self.embedding_matrix_, self.train_inputs_)
        bow = tf.reduce_sum(embeddings_, axis=1)
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=self.nce_weights_,
                biases=self.nce_biases_,
                labels=self.train_labels_,
                inputs=embeddings_,
                num_sampled=self.num_sampled,
                num_classes=self.vocab_size
            )
        )
        train_op = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

        return train_op

    def train(self, new_text=None):
        '''Trains the Word2Vec embeddings.'''

        # ToDo: Handle new_text
        # Ensure that a list of sentences still results in the expected
        # output.
        preprocessed_texts = [self.preprocessor(text) for text in self.tokenized_texts]
        train_inputs, train_labels = self._concat_train_data(preprocessed_texts)

        with tf.Session() as sess:
            for i in range(self.num_epochs):
                for inputs, labels in zip(batch(train_inputs, self.batch_size), batch(train_labels, self.batch_size)):
                    feed_dict = {
                        self.train_inputs_: inputs,
                        self.train_labels_: labels
                    }
                    _ = sess.run(self.train_op, feed_dict=feed_dict)

        logger.info('Training finished.')
        self.embedding_matrix_numpy = sess.eval(self.embedding_matrix_)

    def _build_embedding_dict(self):
        word_to_id = self.tokenizer.word_index
        id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

        embedding_dict = {}
        for i in range(self.embedding_matrix_numpy.shape[0]):
            word = id_to_word[i]
            embedding_dict[word] = self.embedding_matrix_numpy[i, :]

        return embedding_dict

    @property
    def embeddings(self):
        if not self.embedding_dict:
            self.embedding_dict = self._build_embedding_dict()

        return self.embedding_dict

In [5]:
texts = [
    'the quick brown fox jumped over the lazy dog',
    'the slow brown fox snuck under the active dog'
]

In [6]:
model = Word2Vec(texts=texts, embedding_dim=10, num_sampled=3)

In [None]:
model.nce_biases_

In [7]:
model.train()

FailedPreconditionError: Attempting to use uninitialized value nce_biases
	 [[Node: nce_biases/read = Identity[T=DT_FLOAT, _class=["loc:@GradientDescent/update_nce_biases/ScatterSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_biases)]]

Caused by op 'nce_biases/read', defined at:
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-571484cffff1>", line 1, in <module>
    model = Word2Vec(texts=texts, embedding_dim=10, num_sampled=3)
  File "<ipython-input-4-d79a9ff061d4>", line 22, in __init__
    self._initialize_tensors()
  File "<ipython-input-4-d79a9ff061d4>", line 40, in _initialize_tensors
    self.nce_biases_ = tf.get_variable(name='nce_biases', initializer=tf.zeros([self.vocab_size]))
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1317, in get_variable
    constraint=constraint)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1079, in get_variable
    constraint=constraint)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 425, in get_variable
    constraint=constraint)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 394, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 786, in _get_single_variable
    use_resource=use_resource)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2220, in variable
    use_resource=use_resource)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2210, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2193, in default_variable_creator
    constraint=constraint)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variables.py", line 235, in __init__
    constraint=constraint)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\variables.py", line 397, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\array_ops.py", line 142, in identity
    return gen_array_ops.identity(input, name=name)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 3795, in identity
    "Identity", input=input, name=name)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\framework\ops.py", line 3392, in create_op
    op_def=op_def)
  File "C:\Users\rmdelgad\Anaconda3\envs\datascience\lib\site-packages\tensorflow\python\framework\ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value nce_biases
	 [[Node: nce_biases/read = Identity[T=DT_FLOAT, _class=["loc:@GradientDescent/update_nce_biases/ScatterSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_biases)]]


In [None]:
texts = [
    'the quick brown fox jumped over the lazy dog',
    'the slow brown fox snuck under the active dog'
]

embedding_dim=100
window_size=2
architecture='skipgram'
num_epochs=5
num_sampled=4
batch_size=2

skipgram

In [None]:
padded_texts = [pad_sentence(text, window_size) for text in texts]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(padded_texts)
tokenized_texts = tokenizer.texts_to_sequences(padded_texts)
vocab_size = len(tokenizer.word_index)

preprocessor = skipgram_preprocess

inputs = []
labels = []

for text in tokenized_texts:
    sequence_contexts, sequence_labels = preprocessor(text)
    inputs.append(sequence_contexts)
    labels.append(sequence_labels)

if inputs[0].ndim == 1:
    inputs = np.hstack(inputs)
else:
    inputs = np.vstack(inputs)
    
labels = np.vstack(labels)

embeddings_ = tf.nn.embedding_lookup(embedding_matrix_, inputs_batch)

loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights_,
        biases=nce_biases_,
        labels=labels_,
        inputs=embeddings_,
        num_sampled=num_sampled,
        num_classes=vocab_size
   )
)

cbow

In [None]:
padded_texts = [pad_sentence(text, window_size) for text in texts]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(padded_texts)
tokenized_texts = tokenizer.texts_to_sequences(padded_texts)
vocab_size = len(tokenizer.word_index)

preprocessor = cbow_preprocess

inputs = []
labels = []

for text in tokenized_texts:
    sequence_contexts, sequence_labels = preprocessor(text)
    inputs.append(sequence_contexts)
    labels.append(sequence_labels)

if inputs[0].ndim == 1:
    inputs = np.hstack(inputs)
else:
    inputs = np.vstack(inputs)
    
labels = np.vstack(labels)

embeddings_ = tf.nn.embedding_lookup(embedding_matrix_, inputs_batch)
bow = tf.reduce_sum(embeddings_, axis=1)
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights_,
        biases=nce_biases_,
        labels=labels_,
        inputs=embeddings_,
        num_sampled=num_sampled,
        num_classes=vocab_size
   )
)

In [None]:
preprocessor = skipgram_preprocess

preprocessed_texts = [preprocessor(text) for text in tokenized_texts]
inputs_labels = list(itertools.chain.from_iterable(preprocessed_texts))

with tf.Session() as sess:
    initializer = tf.global_variables_initializer()
    sess.run(initializer)
    
    for i in range(num_epochs):
        print(f'Running epoch {i}')
        for inputs, labels in batch(inputs_labels, batch_size):
            labels_arr = np.reshape(np.array(labels), (batch_size, -1))
            inputs_arr = 
            feed_dict = {
                train_inputs_: inputs,
                train_labels_: labels_arr
            }
            _ = sess.run(train_op, feed_dict=feed_dict)

In [None]:
def _concat_train_data(preprocessed_texts):
    if preprocessor == skipgram_preprocess:            
        contexts = np.hstack(t[0] for t in preprocessed_texts)
    else:
        contexts = np.vstack(t[0] for t in preprocessed_texts)
    labels = np.vstack(t[1] for t in preprocessed_texts)

    return contexts, labels

In [None]:
padded_texts = [pad_sentence(text, window_size) for text in texts]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(padded_texts)
tokenized_texts = tokenizer.texts_to_sequences(padded_texts)
vocab_size = len(tokenizer.word_index)

preprocessor = cbow_preprocess
preprocessed_texts = [preprocessor(t) for t in tokenized_texts]

contexts, labels = _concat_train_data(preprocessed_texts)

In [None]:
print(contexts.shape)
print(labels.shape)