In [10]:
from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow as tf

In [107]:
import re

# Controls how many corpus lines load_sst_data reads before it stops.
num_sentences = 2000

def load_sst_data(path, max_sentences=None):
    """Read the first lines of a text corpus into a list of example dicts.

    Each line has every "(<digit>" opener and every ")" removed by the regular
    expression below (this looks like SST-style tree markup; confirm against
    the data), and is stored as ``{'text': <cleaned line>}``.  A trailing
    newline is kept unless the regex consumed it together with a ")".

    Args:
        path: Path of the corpus file, one sentence per line.
        max_sentences: Maximum number of lines to load.  When None, the
            notebook-level ``num_sentences`` setting is used, so existing
            ``load_sst_data(path)`` calls keep working.

    Returns:
        A list with at most ``max_sentences`` dicts, in file order.
    """
    if max_sentences is None:
        max_sentences = num_sentences
    data = []
    with open(path) as f:
        for i, line in enumerate(f):
            # Check the limit *before* appending so exactly `max_sentences`
            # lines are kept (checking afterwards with `>` kept two extra).
            if i >= max_sentences:
                break
            text = re.sub(r'\s*(\(\d)|(\))\s*', '', line)
            data.append({'text': text})
    return data
     
# Load the training examples (a list of {'text': ...} dicts) from a hard-coded
# absolute path; adjust the path for your own environment.
training_set = load_sst_data('/scratch/qx344/data5/training-data/training-data.1m')




def readEvalVocab(path):
    """Read the evaluation vocabulary, one entry per line.

    Surrounding whitespace (including "\\n" or "\\r\\n" line endings) is
    stripped from every line.  Slicing off a fixed two characters instead
    would truncate words in LF-only files and on a final line that has no
    newline.  Assumes each line holds a single whitespace-delimited word --
    TODO confirm against the vocabulary file.

    Args:
        path: Path of the vocabulary file.

    Returns:
        A list of vocabulary strings in file order (blank lines yield '').
    """
    with open(path) as f:
        return [line.strip() for line in f]
evalVocab = readEvalVocab('/scratch/qx344/data5/training-data/evalVocab.txt')

In [27]:
a = training_set[0]['text']
a = a.split()

In [29]:
# Demo cell: print the (left context, focal word, right context) windows that
# _context_windows yields for the first training example only.
# Number of words taken on each side of the focal word.
left_size = 4
right_size = 4
# Token frequencies for the inspected example(s).
word_counts = Counter()
for regionText in training_set[:1]:
    # Whitespace tokenisation of the example text.
    region = regionText['text'].split()
    word_counts.update(region)
    for l_context, word, r_context in _context_windows(region, left_size, right_size):
        # Python 2 print statement: emits the three values separated by spaces.
        print l_context, word, r_context

[] The ['U.S.', 'Centers', 'for', 'Disease']
['The'] U.S. ['Centers', 'for', 'Disease', 'Control']
['The', 'U.S.'] Centers ['for', 'Disease', 'Control', 'and']
['The', 'U.S.', 'Centers'] for ['Disease', 'Control', 'and', 'Prevention']
['The', 'U.S.', 'Centers', 'for'] Disease ['Control', 'and', 'Prevention', 'initially']
['U.S.', 'Centers', 'for', 'Disease'] Control ['and', 'Prevention', 'initially', 'advised']
['Centers', 'for', 'Disease', 'Control'] and ['Prevention', 'initially', 'advised', 'school']
['for', 'Disease', 'Control', 'and'] Prevention ['initially', 'advised', 'school', 'systems']
['Disease', 'Control', 'and', 'Prevention'] initially ['advised', 'school', 'systems', 'to']
['Control', 'and', 'Prevention', 'initially'] advised ['school', 'systems', 'to', 'close']
['and', 'Prevention', 'initially', 'advised'] school ['systems', 'to', 'close', 'if']
['Prevention', 'initially', 'advised', 'school'] systems ['to', 'close', 'if', 'outbreaks']
['initially', 'advised', 'school', 

In [28]:
a[0]

'The'

In [119]:
class NotTrainedError(Exception):
    """Raised when trained embeddings are requested before `GloVeModel.train` ran."""
    pass


class NotFitToCorpusError(Exception):
    """Raised when corpus-derived data is requested before `GloVeModel.fit_to_corpus` ran."""
    pass


class GloVeModel(object):
    """GloVe word-embedding model built on TensorFlow.

    Typical use: construct the model, call `fit_to_corpus` to build the
    vocabulary and the distance-weighted co-occurrence counts, call `train`,
    then read vectors through `embedding_for` or `embeddings`.
    """

    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3.0/4.0, cooccurrence_cap=100, batch_size=512, learning_rate=0.05):
        """Store the hyper-parameters; no corpus or graph work happens here.

        Args:
            embedding_size: Dimensionality of each word vector.
            context_size: Either an int (same window on both sides) or a
                ``(left, right)`` tuple of ints.
            max_vocab_size: Keep at most this many of the most frequent words.
            min_occurrences: Drop words seen fewer times than this.
            scaling_factor: Exponent applied to ``count / cooccurrence_cap``
                in the loss weighting.
            cooccurrence_cap: Count used as the denominator of the weighting
                ratio; the weight is clipped to 1.0 above it.
            batch_size: Number of co-occurrence pairs fed per training step.
            learning_rate: Learning rate handed to the Adagrad optimizer.

        Raises:
            ValueError: If `context_size` is neither an int nor a tuple.
        """
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        # Populated by fit_to_corpus / train.
        self.__words = None
        self.__word_to_id = None
        self.__cooccurrence_matrix = None
        self.__embeddings = None

    def fit_to_corpus(self, corpus):
        """Build vocabulary and co-occurrence counts from `corpus`, then the TF graph.

        Args:
            corpus: Iterable of dicts whose ``'text'`` entry holds a sentence.
        """
        self.__fit_to_corpus(corpus, self.max_vocab_size, self.min_occurrences,
                             self.left_context, self.right_context)
        self.__build_graph()

    def __fit_to_corpus(self, corpus, vocab_size, min_occurrences, left_size, right_size):
        """Count words and distance-weighted co-occurrences, then index the vocabulary.

        Raises:
            ValueError: If the corpus produced no co-occurrence pairs at all.
        """
        word_counts = Counter()
        cooccurrence_counts = defaultdict(float)
        for regionText in corpus:
            # strip() only trims commas/periods at the two ends of the whole
            # text; tokens are then split on whitespace.
            region = regionText['text'].strip(",.").split()
            word_counts.update(region)
            for l_context, word, r_context in _context_windows(region, left_size, right_size):
                # Each context word contributes 1 / (distance from focal word).
                # The left context is reversed so index 0 is the nearest word.
                for i, context_word in enumerate(l_context[::-1]):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(r_context):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
        if len(cooccurrence_counts) == 0:
            raise ValueError("No coccurrences in corpus. Did you try to reuse a generator?")
        self.__words = [word for word, count in word_counts.most_common(vocab_size)
                        if count >= min_occurrences]
        self.__word_to_id = {word: i for i, word in enumerate(self.__words)}
        # Keep only pairs where both words survived the vocabulary filter.
        self.__cooccurrence_matrix = {
            (self.__word_to_id[words[0]], self.__word_to_id[words[1]]): count
            for words, count in cooccurrence_counts.items()
            if words[0] in self.__word_to_id and words[1] in self.__word_to_id}

    def __build_graph(self):
        """Create the TensorFlow graph: inputs, embeddings, weighted loss, optimizer."""
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device(_device_for_node):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            # Fixed-size inputs: every fed batch must hold exactly batch_size pairs.
            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            # minval/maxval are passed by keyword in the correct order
            # (the bounds used to be given positionally as 1.0, -1.0).
            focal_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size],
                                  minval=-1.0, maxval=1.0),
                name="focal_embeddings")
            context_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size],
                                  minval=-1.0, maxval=1.0),
                name="context_embeddings")

            focal_biases = tf.Variable(
                tf.random_uniform([self.vocab_size], minval=-1.0, maxval=1.0),
                name='focal_biases')
            context_biases = tf.Variable(
                tf.random_uniform([self.vocab_size], minval=-1.0, maxval=1.0),
                name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

            # weight = min(1, (count / cap) ** scaling_factor)
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.mul(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

            # (w_i . w_j + b_i + b_j - log(count)) ** 2
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.neg(log_cooccurrences)]))

            single_losses = tf.mul(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.scalar_summary("GloVe loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.merge_all_summaries()

            # The final embedding of a word is focal + context vector.
            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

    def train(self, num_epochs, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        """Run Adagrad over the co-occurrence batches and store the embeddings.

        Batches with fewer than `batch_size` pairs are skipped because the
        placeholders have a fixed shape; with fewer than `batch_size` pairs in
        total no optimisation step runs at all.

        Args:
            num_epochs: Number of passes over the (reshuffled) batches.
            log_dir: Directory for summaries / t-SNE images; None disables both.
            summary_batch_interval: Write a loss summary every this many steps.
            tsne_epoch_interval: Save a t-SNE plot every this many epochs.

        Raises:
            NotFitToCorpusError: If `fit_to_corpus` has not been called.
        """
        should_write_summaries = log_dir is not None and summary_batch_interval
        should_generate_tsne = log_dir is not None and tsne_epoch_interval
        batches = self.__prepare_batches()
        total_steps = 0
        with tf.Session(graph=self.__graph) as session:
            summary_writer = None
            if should_write_summaries:
                summary_writer = tf.train.SummaryWriter(log_dir, graph_def=session.graph_def)
            try:
                tf.initialize_all_variables().run()
                for epoch in range(num_epochs):
                    shuffle(batches)
                    for batch_index, batch in enumerate(batches):
                        i_s, j_s, counts = batch
                        if len(counts) != self.batch_size:
                            continue
                        feed_dict = {
                            self.__focal_input: i_s,
                            self.__context_input: j_s,
                            self.__cooccurrence_count: counts}
                        session.run([self.__optimizer], feed_dict=feed_dict)
                        if should_write_summaries and \
                                (total_steps + 1) % summary_batch_interval == 0:
                            summary_str = session.run(self.__summary, feed_dict=feed_dict)
                            summary_writer.add_summary(summary_str, total_steps)
                        total_steps += 1
                    if should_generate_tsne and (epoch + 1) % tsne_epoch_interval == 0:
                        current_embeddings = self.__combined_embeddings.eval()
                        output_path = os.path.join(log_dir, "epoch{:03d}.png".format(epoch + 1))
                        self.generate_tsne(output_path, embeddings=current_embeddings)
                self.__embeddings = self.__combined_embeddings.eval()
            finally:
                # Close the writer even when a training step raises.
                if summary_writer is not None:
                    summary_writer.close()

    def embedding_for(self, word_str_or_id):
        """Return the trained vector for a word (string) or a vocabulary id (int).

        Raises:
            NotTrainedError: If the model has not been trained.
            KeyError: If the word is not in the vocabulary.
            TypeError: If the argument is neither a string nor an int (this
                case used to return None silently).
        """
        # type(u'') is `unicode` on Python 2 and `str` on Python 3.
        if isinstance(word_str_or_id, (str, type(u''))):
            return self.embeddings[self.id_for_word(word_str_or_id)]
        if isinstance(word_str_or_id, int):
            return self.embeddings[word_str_or_id]
        raise TypeError("`word_str_or_id` should be a string or an int, got {}".format(
            type(word_str_or_id).__name__))

    def tell_id(self, word):
        """Return the vocabulary id of `word`; alias of `id_for_word`."""
        return self.id_for_word(word)

    def containWord(self, word):
        """Return True when `word` is in the fitted vocabulary, else False.

        Raises:
            NotFitToCorpusError: If `fit_to_corpus` has not been called.
        """
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up words.")
        return word in self.__word_to_id

    def __prepare_batches(self):
        """Split the co-occurrence matrix into (focal ids, context ids, counts) batches."""
        if self.__cooccurrence_matrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        if not self.__cooccurrence_matrix:
            # zip(*[]) cannot be unpacked into three sequences; fail clearly.
            raise ValueError("No cooccurrences left after vocabulary filtering.")
        cooccurrences = [(word_ids[0], word_ids[1], count)
                         for word_ids, count in self.__cooccurrence_matrix.items()]
        i_indices, j_indices, counts = zip(*cooccurrences)
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        """Number of words in the fitted vocabulary."""
        return len(self.words)

    @property
    def words(self):
        """Vocabulary words, most frequent first; index equals word id."""
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        """Trained embedding matrix (focal + context), one row per word id."""
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        """Return the vocabulary id of `word` (KeyError when unknown)."""
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        """Project the first `word_count` embeddings to 2-D with t-SNE and plot them.

        Args:
            path: Where to save the figure; passed through to the plot helper.
            size: Figure size in inches.
            word_count: Number of leading vocabulary entries to plot.
            embeddings: Matrix to plot; defaults to the trained embeddings.
        """
        if embeddings is None:
            embeddings = self.embeddings
        # Imported lazily so scikit-learn is only needed when plotting.
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)
    
    


In [120]:
def _context_windows(region, left_size, right_size):
    """Yield ``(left_context, word, right_context)`` for every token in `region`.

    The left context covers up to `left_size` tokens before the word and the
    right context up to `right_size` tokens after it; both are obtained from
    `_window`, so they are shorter near the ends of `region`.
    """
    for position in range(len(region)):
        before = _window(region, position - left_size, position - 1)
        after = _window(region, position + 1, position + right_size)
        yield (before, region[position], after)


def _window(region, start_index, end_index):
    """
    Returns the list of words starting from `start_index`, going to `end_index`
    taken from region. If `start_index` is a negative number, or if `end_index`
    is greater than the index of the last word in region, this function will pad
    its return value with `NULL_WORD`.
    """
    last_index = len(region) + 1
    selected_tokens = region[max(start_index, 0):min(end_index, last_index) + 1]
    return selected_tokens


def _device_for_node(n):
    if n.type == "MatMul":
        return "/gpu:0"
    else:
        return "/cpu:0"


def _batchify(batch_size, *sequences):
    for i in xrange(0, len(sequences[0]), batch_size):
        yield tuple(sequence[i:i+batch_size] for sequence in sequences)


def _plot_with_labels(low_dim_embs, labels, path, size):
    """Scatter-plot 2-D points and annotate each one with its label.

    Args:
        low_dim_embs: Array with one (x, y) row per point; must have at least
            as many rows as there are labels.
        labels: Text to place next to each point, in row order.
        path: File to save the figure to.  When None the figure is neither
            saved nor closed.
        size: Figure size in inches.
    """
    import matplotlib.pyplot as plt
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    figure = plt.figure(figsize=size)  # in inches
    for label, point in zip(labels, low_dim_embs):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                     ha='right', va='bottom')
    if path is None:
        return
    figure.savefig(path)
    plt.close(figure)

In [121]:
model = GloVeModel(50, 4)

In [122]:
model.fit_to_corpus(training_set)

In [123]:
model.train(100)

In [127]:
a = model.embedding_for('virus')

In [129]:
a[0]

0.57340944

In [131]:
import numpy as np
np.set_printoptions(linewidth = 1000000, suppress = True)
# Export the embeddings of the evaluation vocabulary as text:
#   line 1:       "<number of words> <embedding size>"
#   other lines:  "<word> <v1> <v2> ... <vN>"
# Words the model does not know get an all-zero vector.
zero_row = ' '.join(['0.0'] * model.embedding_size)
# The `with` block guarantees the file is flushed and closed.
with open('/home/qx344/gloveEmbed.txt', 'w') as fout:
    fout.write(str(len(evalVocab)) + ' ' + str(model.embedding_size) + '\n')
    for vocab in evalVocab:
        if model.containWord(vocab):
            # Format every component as text; handing the ndarray itself to
            # write() does not produce readable numbers.
            row = ' '.join('%.8f' % value for value in model.embedding_for(vocab))
        else:
            row = zero_row
        fout.write(vocab + ' ' + row + '\n')

In [130]:
a

array([ 0.58481336,  0.57340944,  0.31806454, -0.6068185 ,  0.2880601 , -0.09446254,  0.79951704,  1.2294755 ,  0.0762113 , -0.79812741, -0.9644382 , -0.41085178, -1.02306688,  1.18787301, -1.11813414,  0.26721951,  0.47297657,  0.35375893, -1.41823208, -0.79700112, -1.4817158 , -1.15799618,  1.32393038, -0.20639223, -0.05705154,  0.53559887, -0.32373905, -0.74698293,  0.6458782 ,  0.06923535, -0.49338272,  1.14246178,  1.49533963,  0.23554997,  0.3804363 , -0.30130619,  0.35470659,  0.4408406 , -1.07319152,  0.92572415, -0.97303712, -1.02625942,  0.13298917,  0.40031284, -0.36523563,  0.23910843,  0.49115449,  0.84107864,  1.51430702,  0.42416972], dtype=float32)

In [19]:
def readEvalVocab(path):
    """Read the evaluation vocabulary, one entry per line.

    Surrounding whitespace (including "\\n" or "\\r\\n" line endings) is
    stripped from every line.  Slicing off a fixed two characters instead
    would truncate words in LF-only files and on a final line that has no
    newline.  Assumes each line holds a single whitespace-delimited word --
    TODO confirm against the vocabulary file.

    Args:
        path: Path of the vocabulary file.

    Returns:
        A list of vocabulary strings in file order (blank lines yield '').
    """
    with open(path) as f:
        return [line.strip() for line in f]
evalVocab = readEvalVocab('/scratch/qx344/data5/training-data/evalVocab.txt')

In [7]:
training_set[0]

{'text': 'The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .\n'}