In [191]:
#essential imports
import tensorflow as tf
import  os

In [192]:
#configuration variables
fileLocation        = "./corpus"
vocab_size          = 100000
min_occurence       = 1
scaling_factor      = 3/4.0
cooccurence_cap     = 100
batch_size          = 2
learning_rate       = 0.05
embedding_size      = 10
left_context_size   = 2
right_context_size  = 2
words               = None
word_to_id          = None
#cooccurence_matrix[w1, w2] = float
cooccurence_matrix  = None
embeddings          = None
epoch_loss_print    = 10
epoch_tsne_print    = 10
log_dir             = "./logs"

In [181]:
#utility to read the corpus
def readCorpus(filename):
    for line in open(filename):
        line = line.strip()
        yield line.lower().split()

#get the left context
def get_left_context(region, i, left_size):
    start_index = i-left_size
    if start_index<0:
        start_index=0
    left_context = region[start_index:i]
    left_more = ['null_word']*(left_size-len(left_context))
    left_more.extend(left_context)
    return left_more

#get the right context window
def get_right_context(region, i, right_size):
    end_index = i+right_size+1
    total_region = len(region)
    if end_index>total_region:
        end_index=total_region
    right_context = region[i+1:end_index]
    right_context.extend(['null_word']*(right_size-len(right_context)))
    return right_context

#get the window
def window(region, left_size=3, right_size=3):
    total_region = len(region)
    for i, word in enumerate(region):
        left_context = get_left_context(region, i, left_size)
        right_context = get_right_context(region, i, right_size)
        yield (left_context, word, right_context)

In [182]:
from collections import Counter, defaultdict
def fit_to_corpus(corpus, vocab_size, min_occurences, left_size, right_size):
    words_count = Counter()
    #provides value for non-existent key
    cooccurence_counts = defaultdict(float)
    for region in corpus:
        words_count.update(region)
        #add 1/distance from the position of centralized context word
        for l_context, word, r_context in window(region, left_size, right_size):
            for i, context_word in enumerate(l_context[::-1]):
                cooccurence_counts[(word, context_word)] += 1/(i+1)
            for i, context_word in enumerate(r_context):
                cooccurence_counts[(word, context_word)] += 1/(i+1)
    words = [word for word, count in words_count.most_common(max_vocab_size) if count>=min_occurences]
    word_to_id = {word:i for i, word in enumerate(words)}
    cooccurence_matrix = {
                        (word_to_id[words[0]], word_to_id[words[1]]):count
                        for words, count in cooccurence_counts.items()
                         if words[0] in word_to_id and words[1] in word_to_id
                        }
    return words, word_to_id, cooccurence_matrix

In [194]:
#get the corpus
corpus                                = readCorpus(fileLocation)
#get words, word_to_id and cooccurence matrix by fitting it to corpus
#we have words, wordstoid and possible cooccurence matrix for the words
words, word_to_id, cooccurence_matrix = fit_to_corpus(corpus, max_vocab_size, min_occurence, left_context_size,
                                                      right_context_size)

In [184]:
print(cooccurence_matrix)

{(14, 17): 0.0, (24, 17): 1.0, (13, 4): 0.0, (25, 12): 0.0, (24, 4): 0.0, (22, 6): 1.0, (18, 30): 0.0, (8, 5): 0.0, (5, 8): 0.0, (9, 0): 0.0, (29, 8): 1.0, (11, 5): 1.0, (16, 3): 1.0, (17, 24): 1.0, (11, 22): 0.0, (10, 18): 1.0, (24, 14): 1.0, (7, 28): 1.0, (13, 20): 1.0, (20, 4): 1.0, (23, 25): 0.0, (5, 11): 1.0, (14, 24): 1.0, (29, 11): 0.0, (30, 19): 1.0, (16, 0): 0.0, (17, 20): 0.0, (15, 1): 1.0, (18, 10): 1.0, (21, 15): 0.0, (2, 28): 1.0, (28, 15): 0.0, (14, 26): 1.0, (9, 19): 0.0, (26, 24): 0.0, (0, 27): 1.0, (1, 15): 1.0, (9, 25): 1.0, (7, 2): 0.0, (10, 30): 1.0, (7, 15): 1.0, (20, 17): 0.0, (1, 16): 0.0, (25, 0): 1.0, (3, 27): 0.0, (7, 1): 0.0, (20, 31): 0.0, (10, 19): 0.0, (0, 18): 0.0, (15, 7): 1.0, (2, 7): 0.0, (4, 17): 1.0, (15, 28): 0.0, (19, 12): 1.0, (31, 20): 0.0, (25, 9): 1.0, (26, 14): 1.0, (18, 0): 0.0, (3, 0): 1.0, (3, 21): 0.0, (31, 13): 1.0, (30, 18): 0.0, (12, 9): 1.0, (25, 23): 0.0, (21, 1): 1.0, (18, 27): 1.0, (4, 13): 0.0, (28, 2): 1.0, (27, 10): 0.0, (8, 29):

In [185]:
focal_input         = tf.placeholder(tf.int32, shape=[batch_size], name="focal_words")
context_input       = tf.placeholder(tf.int32, shape=[batch_size], name="context_words")
cooccurence_count   = tf.placeholder(tf.float32, shape=[batch_size], name="cooccurence_count")
# epsilon          
#full embedding size variables
focal_embeddings    = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="focal_embeddings")
context_embeddings  = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="context_embeddings")
focal_biases        = tf.Variable(tf.random_uniform([vocab_size], -1.0, 1.0), name="focal_biases")
context_biases      = tf.Variable(tf.random_uniform([vocab_size], -1.0, 1.0), name="context_biases")
#embeddings lookup
focal_embedding     = tf.nn.embedding_lookup([focal_embeddings], focal_input)
context_embedding   = tf.nn.embedding_lookup([context_embeddings], context_input)
focal_bias          = tf.nn.embedding_lookup([focal_biases], focal_input)
context_bias        = tf.nn.embedding_lookup([context_biases], context_input)
product             = tf.multiply(focal_embedding, context_embedding)
embedding_product   = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)
cooccurence_epsilon = cooccurence_count+1e-10
log_cooccurences    = tf.log(cooccurence_epsilon)
distance_expr       = tf.square(tf.add_n([
                        embedding_product,
                        focal_bias,
                        context_bias,
                        tf.negative(log_cooccurences)]))
count_max           = tf.constant([cooccurence_cap], dtype=tf.float32, name="max_cooccurence_count")
scaling_factor_input      = tf.constant([scaling_factor], dtype=tf.float32, name="scaling_factor")
weighting_factor    = tf.minimum(1.0, tf.pow(tf.div(cooccurence_count, count_max), scaling_factor_input))
single_losses       = tf.multiply(weighting_factor, distance_expr)
total_loss          = tf.reduce_sum(single_losses)
optimizer           = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)
combined_embeddings = tf.add(focal_embeddings, context_embeddings, name="combined_embeddings")


In [186]:
def batchify(batch_size, *sequences):
    for i in range(0, len(sequences[0]), batch_size):
        yield tuple(sequence[i:i+batch_size] for sequence in sequences)

def prepare_batches():
    #get cooccurence matrix as list of elements and return each list as batch
    cooccurrences = [(word_ids[0], word_ids[1], count) for word_ids, count in cooccurence_matrix.items()]
    i_indices, j_indices, counts = zip(*cooccurrences)
    return list(batchify(batch_size, i_indices, j_indices, counts))

In [178]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_with_labels(low_dim_embs, labels, path, size):
    figure = plt.figure(figsize=size)  # in inches
    for i, label in enumerate(labels):
        #for each label get its x and y position.
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        #text of annotation, xyposition, place label, coordinate system, 
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right',
                     va='bottom')
    if path is not None:
        figure.savefig(path)
        plt.close(figure)

def generate_tsne(path, size=(10, 10), word_count=1000, embeddings=None):
    #get tsne representation
    tsne         = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    #get the tsne transformation for each embedding
    low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
    #get label to assign for each point in embedding space
    labels       = words[:word_count]
    return plot_with_labels(low_dim_embs, labels, path, size)


In [189]:
def train(num_epochs):
    #get the batches
    total_steps=0
    batches = prepare_batches()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for epoch in range(num_epochs):
            for batch_index, batch in enumerate(batches):
                i_s, j_s, counts = batch
                feed_dict = {focal_input:i_s, context_input:j_s, cooccurence_count:counts}
                sess.run([optimizer], feed_dict=feed_dict)
                total_steps+=1
            if epoch%epoch_loss_print==0:
                loss = sess.run([total_loss], feed_dict)
                print("Loss is " + str(loss))
            if epoch%epoch_tsne_print==0:
                embeddings      = combined_embeddings.eval()
                outputLocation  = ""
                output_path = os.path.join(log_dir, "epoch{:03d}.jpeg".format(epoch))
                generate_tsne(output_path, embeddings=embeddings)
        embeddings = combined_embeddings.eval()
    return embeddings

In [197]:
embeddings = train(200)
# print(embeddings)

Loss is [0.028760383]
Loss is [0.0058103735]
Loss is [0.0018895346]
Loss is [0.00066513265]
Loss is [0.00023938998]
Loss is [8.7818233e-05]
Loss is [3.2770946e-05]
Loss is [1.2384815e-05]
Loss is [4.7169274e-06]
Loss is [1.8032845e-06]
Loss is [6.8995962e-07]
Loss is [2.636458e-07]
Loss is [1.0046299e-07]
Loss is [3.812718e-08]
Loss is [1.4387706e-08]
Loss is [5.4012039e-09]
Loss is [2.015397e-09]
Loss is [7.4349765e-10]
Loss is [2.7228761e-10]
Loss is [9.733387e-11]


In [179]:
output_path = os.path.join(log_dir, "final_embeddings.jpeg")
generate_tsne(output_path, embeddings=embeddings)
print("Image generated")

Image generated


In [None]:
#understand how the full code works with functions written in matrix
#refactor for printing and the corpus fully built up.

In [None]:
#plotting the t-SNE section