In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm



In [2]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import time
import string
import itertools
import pickle
import tensorflow as tf

from collections import Counter
from itertools import filterfalse
from functools import reduce
from scipy import sparse

import gc

from dask import distributed
from dask.distributed import Client, LocalCluster

[nltk_data] Downloading package punkt to /home/santhosr/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Old Model

In [None]:
from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow as tf


class NotTrainedError(Exception):
    """Raised when trained embeddings are requested before training has run."""

class NotFitToCorpusError(Exception):
    """Raised when corpus-derived state is requested before the model was fit."""

class GloVeModel():
    """GloVe word-embedding model built on the TensorFlow 1.x graph/session API.

    Workflow: construct the model, call `fit_to_corpus` (raw token regions) or
    `fit_to_cmatrix` (pre-built co-occurrence matrix), call `train`, then read
    the result through `embeddings` / `embedding_for`.
    """

    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3/4, cooccurrence_cap=100, batch_size=512, learning_rate=0.05):
        """Store hyper-parameters; no corpus is read and no graph is built here.

        Args:
            embedding_size: width of each embedding vector.
            context_size: an int (symmetric window) or a `(left, right)` tuple.
            max_vocab_size: upper bound on the number of vocabulary words kept.
            min_occurrences: minimum word count for a word to be kept.
            scaling_factor: exponent used in the loss weighting term.
            cooccurrence_cap: count at which the loss weighting saturates at 1.
            batch_size: fixed number of pairs fed per training step.
            learning_rate: learning rate handed to the Adagrad optimizer.

        Raises:
            ValueError: if `context_size` is neither an int nor a tuple.
        """
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.__words = None
        self.__word_to_id = None
        self.__id_to_word = None
        self.__cooccurrence_matrix = None
        self.__embeddings = None

    def fit_to_corpus(self, corpus):
        """Count words and co-occurrences in `corpus`, then build the TF graph.

        Args:
            corpus: iterable of regions, each region being a sequence of tokens.
        """
        self.__fit_to_corpus(corpus, self.max_vocab_size, self.min_occurrences,
                             self.left_context, self.right_context)
        self.__build_graph()

    def __fit_to_corpus(self, corpus, vocab_size, min_occurrences, left_size, right_size):
        """Populate the vocabulary and the `(focal_id, context_id) -> weight` dict.

        Every context word contributes `1 / distance` to its pair's weight.

        Raises:
            ValueError: if no co-occurrence was seen at all.
        """
        word_counts = Counter()
        cooccurrence_counts = defaultdict(float)
        for region in tqdm(corpus):
            word_counts.update(region)
            for l_context, word, r_context in _context_windows(region, left_size, right_size):
                # The left context is reversed so index 0 is the nearest neighbour.
                for i, context_word in enumerate(l_context[::-1]):
                    # add (1 / distance from focal word) for this pair
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(r_context):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
        if len(cooccurrence_counts) == 0:
            raise ValueError("No coccurrences in corpus. Did you try to reuse a generator?")
        self.__words = [word for word, count in word_counts.most_common(vocab_size)
                        if count >= min_occurrences]
        self.__word_to_id = {word: i for i, word in enumerate(self.__words)}
        self.__id_to_word = {i: word for i, word in enumerate(self.__words)}
        # Keep only pairs whose two words both survived the vocabulary cut.
        self.__cooccurrence_matrix = {
            (self.__word_to_id[words[0]], self.__word_to_id[words[1]]): count
            for words, count in cooccurrence_counts.items()
            if words[0] in self.__word_to_id and words[1] in self.__word_to_id}

    def fit_to_cmatrix(self, cmatrix, wordCountFile='wordCount'):
        """Fit the model to a pre-built co-occurrence matrix, then build the TF graph.

        Args:
            cmatrix: sparse matrix exposing `.tocoo()`. Its row/column indices are
                assumed to be word ids in `most_common` order of the pickled
                counter — TODO confirm against the code that built the matrix.
            wordCountFile: path of a pickled `Counter` of word frequencies
                (defaults to the previously hard-coded 'wordCount').

        Every stored entry becomes a Python dict item, so very large matrices
        need a lot of memory with this model.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(wordCountFile, 'rb') as count_file:
            wordCount = pickle.load(count_file)
        # `self.vocab_size` is not usable yet (nothing is fit), so the cut-off
        # comes from the configured maximum instead.
        vocab = wordCount.most_common(self.max_vocab_size)

        ## Creating the Word-ID dictionaries
        self.__id_to_word = {i: x[0] for i, x in enumerate(vocab)}
        self.__word_to_id = {value: key for key, value in self.__id_to_word.items()}
        # Ordered list (index == word id) so `generate_tsne` can slice it.
        self.__words = [x[0] for x in vocab]

        coo = cmatrix.tocoo()
        vocab_len = len(self.__words)
        pair_counts = defaultdict(float)
        for i, j, count in zip(coo.row, coo.col, coo.data):
            # Drop ids outside the vocabulary; sum entries stored more than once.
            if i < vocab_len and j < vocab_len:
                pair_counts[(int(i), int(j))] += float(count)
        self.__cooccurrence_matrix = dict(pair_counts)
        self.__build_graph()

    def __build_graph(self):
        """Build the TensorFlow graph: placeholders, variables, loss and optimizer."""
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device(_device_for_node):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            # Placeholders have a fixed length, so every fed batch must hold
            # exactly `batch_size` pairs.
            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            # NOTE(review): minval=1.0 / maxval=-1.0 are passed in reversed order;
            # presumably the intent is uniform values in [-1, 1] — confirm.
            focal_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                name="focal_embeddings")
            context_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                name="context_embeddings")

            focal_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                       name='focal_biases')
            context_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                         name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

            # weight = min(1, (count / cap) ** scaling_factor)
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

            # (w_i . w_j + b_i + b_j - log(count)) ** 2
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.negative(log_cooccurrences)]))

            single_losses = tf.multiply(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.summary.scalar("GloVe_loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.summary.merge_all()

            # The published embedding is the sum of the focal and context tables.
            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

    def train(self, num_epochs, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        """Run `num_epochs` passes over the co-occurrence pairs.

        Args:
            num_epochs: number of passes over all batches.
            log_dir: directory for TensorBoard summaries / t-SNE images; both are
                disabled when it is None.
            summary_batch_interval: write a summary every this many steps.
            tsne_epoch_interval: render a t-SNE plot every this many epochs.

        Raises:
            NotFitToCorpusError: if the model has not been fit yet.
        """
        should_write_summaries = log_dir is not None and summary_batch_interval
        should_generate_tsne = log_dir is not None and tsne_epoch_interval
        batches = self.__prepare_batches()
        total_steps = 0
        with tf.Session(graph=self.__graph) as session:
            if should_write_summaries:
                print("Writing TensorBoard summaries to {}".format(log_dir))
                summary_writer = tf.summary.FileWriter(log_dir, graph=session.graph)
            tf.global_variables_initializer().run()
            for epoch in range(num_epochs):
                shuffle(batches)
                for batch_index, batch in enumerate(batches):
                    i_s, j_s, counts = batch
                    # The placeholders are fixed-size, so a short batch is skipped.
                    if len(counts) != self.batch_size:
                        continue
                    feed_dict = {
                        self.__focal_input: i_s,
                        self.__context_input: j_s,
                        self.__cooccurrence_count: counts}
                    session.run([self.__optimizer], feed_dict=feed_dict)
                    if should_write_summaries and (total_steps + 1) % summary_batch_interval == 0:
                        summary_str = session.run(self.__summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, total_steps)
                    total_steps += 1
                if should_generate_tsne and (epoch + 1) % tsne_epoch_interval == 0:
                    current_embeddings = self.__combined_embeddings.eval()
                    output_path = os.path.join(log_dir, "epoch{:03d}.png".format(epoch + 1))
                    self.generate_tsne(output_path, embeddings=current_embeddings)
            self.__embeddings = self.__combined_embeddings.eval()
            if should_write_summaries:
                summary_writer.close()

    def embedding_for(self, word_str_or_id):
        """Return the trained embedding row for a word string or an integer id.

        Raises:
            NotTrainedError: if the model has not been trained.
            TypeError: if the argument is neither a str nor an integer.
        """
        if isinstance(word_str_or_id, str):
            return self.embeddings[self.__word_to_id[word_str_or_id]]
        # numpy integer ids are accepted as well as plain ints.
        if isinstance(word_str_or_id, (int, np.integer)):
            return self.embeddings[word_str_or_id]
        raise TypeError("`word_str_or_id` should be a str or an int")

    def __prepare_batches(self):
        """Split all co-occurrence triples into `batch_size`-long batches."""
        if self.__cooccurrence_matrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        cooccurrences = [(word_ids[0], word_ids[1], count)
                         for word_ids, count in self.__cooccurrence_matrix.items()]
        i_indices, j_indices, counts = zip(*cooccurrences)
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        """Number of vocabulary words; requires the model to be fit."""
        if self.__words is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before accessing vocab size.")
        return len(self.__words)

    @property
    def words(self):
        """Vocabulary words, indexed by word id."""
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        """Combined (focal + context) embedding matrix produced by `train`."""
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        """Return the integer id of `word`; raises KeyError for unknown words."""
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        """Project the first `word_count` embeddings to 2-D with t-SNE and plot them.

        Args:
            path: file to save the figure to; nothing is saved when None.
            size: figure size in inches.
            word_count: number of leading vocabulary rows to plot.
            embeddings: matrix to plot; defaults to the trained embeddings.
        """
        if embeddings is None:
            embeddings = self.embeddings
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)


def _context_windows(region, left_size, right_size):
    """Yield `(left_context, word, right_context)` for each position in `region`.

    The left context holds up to `left_size` tokens before the word and the
    right context up to `right_size` tokens after it; both come from `_window`.
    """
    for position, focal in enumerate(region):
        before = _window(region, position - left_size, position - 1)
        after = _window(region, position + 1, position + right_size)
        yield (before, focal, after)


def _window(region, start_index, end_index):
    """
    Returns the list of words starting from `start_index`, going to `end_index`
    taken from region. If `start_index` is a negative number, or if `end_index`
    is greater than the index of the last word in region, this function will pad
    its return value with `NULL_WORD`.
    """
    last_index = len(region) + 1
    selected_tokens = region[max(start_index, 0):min(end_index, last_index) + 1]
    return selected_tokens


def _device_for_node(n):
    if n.type == "MatMul":
        return "/gpu:0"
    else:
        return "/cpu:0"


def _batchify(batch_size, *sequences):
    for i in range(0, len(sequences[0]), batch_size):
        yield tuple(sequence[i:i+batch_size] for sequence in sequences)


def _plot_with_labels(low_dim_embs, labels, path, size):
    """Scatter-plot each 2-D point with its label; save to `path` when given.

    `size` is the figure size in inches. When `path` is None the figure is
    neither saved nor closed.
    """
    import matplotlib.pyplot as plt
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    figure = plt.figure(figsize=size)  # in inches
    annotation_style = dict(xytext=(5, 2), textcoords='offset points', ha='right',
                            va='bottom')
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)
    if path is None:
        return
    figure.savefig(path)
    plt.close(figure)


### New model

In [28]:
from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow as tf


class NotTrainedError(Exception):
    """Raised when trained embeddings are requested before training has run."""

class NotFitToCorpusError(Exception):
    """Raised when corpus-derived state is requested before the model was fit."""

class GloVeModel():
    """GloVe model trained from a pre-built sparse co-occurrence matrix.

    Workflow: construct the model (optionally passing pretrained focal/context
    arrays), call `fit_to_cmatrix`, call `train`, then read `embeddings` or
    persist the two tables with `saveEmbeddings`. Uses the TensorFlow 1.x
    graph/session API.
    """

    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3/4, cooccurrence_cap=100, batch_size=512, learning_rate=0.05,
                load_context_vecs = None, load_focal_vecs = None):
        """Store hyper-parameters; no file is read and no graph is built here.

        Args:
            embedding_size: width of each embedding vector.
            context_size: an int (symmetric window) or a `(left, right)` tuple.
            max_vocab_size: upper bound on the number of vocabulary words kept.
            min_occurrences: stored only; not read elsewhere in this class.
            scaling_factor: exponent used in the loss weighting term.
            cooccurrence_cap: count at which the loss weighting saturates at 1.
            batch_size: fixed number of pairs fed per training step.
            learning_rate: learning rate handed to the Adagrad optimizer.
            load_context_vecs: optional initial value for the context table.
            load_focal_vecs: optional initial value for the focal table.

        Raises:
            ValueError: if `context_size` is neither an int nor a tuple.
        """
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.__words = None
        self.__word_to_id = None
        self.__id_to_word = None
        self.__cmatrix = None
        self.__num_pairs = None

        self.__embeddings = None
        self.__focal_embeddings = None
        self.__context_embeddings = None
        self.__focal = None
        self.__context= None

        self.__load_context_vecs = load_context_vecs
        self.__load_focal_vecs = load_focal_vecs

    def fit_to_cmatrix(self, cmatrix, wordCountFile):
        """
        Fits a pre-built co-occurrence matrix to the model and builds the TF graph.

        Args:
            cmatrix: sparse matrix exposing `.tocoo()`. Its row/column indices are
                assumed to be word ids in `most_common` order of the pickled
                counter — TODO confirm against the code that built the matrix.
            wordCountFile: path of a pickled `Counter` of word frequencies.
        """
        print("In here")
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(wordCountFile, 'rb') as count_file:
            wordCount = pickle.load(count_file)
        vocab = wordCount.most_common(self.max_vocab_size)

        ## Creating the Word-ID dictionaries
        self.__id_to_word = {i:x[0] for i,x in enumerate(vocab)}

        self.__word_to_id = {value:key for key,value in self.__id_to_word.items()}

        # Ordered list (index == word id) so `generate_tsne` can slice it.
        self.__words = [x[0] for x in vocab]

        self.__cmatrix = cmatrix.tocoo()

        self.__num_pairs = len(self.__cmatrix.row)

        self.__build_graph()

    def __build_graph(self):
        """Build the TensorFlow graph: placeholders, variables, loss and optimizer."""
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device("/device:GPU:0"):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            # Placeholders have a fixed length, so every fed batch must hold
            # exactly `batch_size` pairs.
            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            print("Right here")
            # NOTE(review): minval=1.0 / maxval=-1.0 are passed in reversed order;
            # presumably the intent is uniform values in [-1, 1] — confirm.
            if self.__load_focal_vecs is None:
                focal_embeddings = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                    name="focal_embeddings")
            else:
                # Pretrained values are used as-is; their shape is not checked
                # against (vocab_size, embedding_size) here.
                print("Loading pretrained values")
                focal_embeddings = tf.Variable(self.__load_focal_vecs,name="focal_embeddings")

            if self.__load_context_vecs is None:
                context_embeddings = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                    name="context_embeddings")
            else:
                print("Loading pretrained values")
                context_embeddings = tf.Variable(self.__load_context_vecs,name="context_embeddings")

            focal_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                       name='focal_biases')
            context_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                         name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

            # weight = min(1, (count / cap) ** scaling_factor)
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

            # (w_i . w_j + b_i + b_j - log(count)) ** 2
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.negative(log_cooccurrences)]))

            single_losses = tf.multiply(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.summary.scalar("GloVe_loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.summary.merge_all()

            # The published embedding is the sum of the focal and context tables.
            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

            self.__focal_embeddings = focal_embeddings
            self.__context_embeddings = context_embeddings

    def train(self, num_steps = 1000, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        """Run `num_steps` optimizer steps, each on a freshly sampled batch.

        Args:
            num_steps: number of optimizer steps to run.
            log_dir, summary_batch_interval, tsne_epoch_interval: accepted for
                signature compatibility; not used by this implementation.

        Raises:
            NotFitToCorpusError: if `fit_to_cmatrix` has not been called.
        """
        if self.__cmatrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to a cooccurrence matrix before training.")

        # allow_soft_placement lets the session honour the graph's GPU device
        # request without failing when an op cannot be placed there.
        with tf.Session(graph=self.__graph, config = tf.ConfigProto(allow_soft_placement = True)) as session:

            # Variables are (re)initialised on every call to `train`.
            tf.global_variables_initializer().run()

            for step in range(num_steps):
                i_s, j_s, counts = zip(*self.getBatch())

                feed_dict = {
                        self.__focal_input: i_s,
                        self.__context_input: j_s,
                        self.__cooccurrence_count: counts}

                session.run([self.__optimizer], feed_dict=feed_dict)

            self.__embeddings = self.__combined_embeddings.eval()
            self.__focal  = self.__focal_embeddings.eval()
            self.__context = self.__context_embeddings.eval()

    def getBatch(self):
        """Sample `batch_size` random `(focal_id, context_id, count)` triples.

        Entries are drawn uniformly with replacement from the stored matrix
        entries, and each pair is flipped with probability one half.

        Raises:
            NotFitToCorpusError: if `fit_to_cmatrix` has not been called.
            ValueError: if the matrix has no stored entries.
        """
        if self.__cmatrix is None or self.__num_pairs is None:
            raise NotFitToCorpusError(
                "Need to fit model to a cooccurrence matrix before sampling batches.")
        if self.__num_pairs == 0:
            raise ValueError("Cooccurrence matrix has no stored entries to sample from.")

        picks = np.random.randint(self.__num_pairs, size=self.batch_size)

        #Shuffling the center and context words because we have stored values only in one direction
        keep_order = np.random.random(self.batch_size) > 0.5

        rows = self.__cmatrix.row[picks]
        cols = self.__cmatrix.col[picks]
        focal_ids = np.where(keep_order, rows, cols)
        context_ids = np.where(keep_order, cols, rows)
        counts = self.__cmatrix.data[picks]

        return list(zip(focal_ids, context_ids, counts))

    def saveEmbeddings(self,suffix =''):
        """Pickle the focal and context tables to 'focal_embed<suffix>' and
        'context_embed<suffix>' in the current working directory.

        Raises:
            NotTrainedError: if `train` has not produced the tables yet.
        """
        if self.__focal is None or self.__context is None:
            raise NotTrainedError("Need to train model before saving embeddings")

        with open('focal_embed'+suffix,'wb') as focal_file:
            pickle.dump(self.__focal, focal_file)
        with open('context_embed'+suffix,'wb') as context_file:
            pickle.dump(self.__context, context_file)

    def embedding_for(self, word_str_or_id):
        """Return the trained embedding row for a word string or an integer id.

        Raises:
            NotTrainedError: if the model has not been trained.
            TypeError: if the argument is neither a str nor an integer.
        """
        if isinstance(word_str_or_id, str):
            return self.embeddings[self.__word_to_id[word_str_or_id]]
        # numpy integer ids (e.g. taken from the matrix) are accepted too.
        if isinstance(word_str_or_id, (int, np.integer)):
            return self.embeddings[word_str_or_id]
        raise TypeError("`word_str_or_id` should be a str or an int")

    def __prepare_batches(self):
        """Split all stored matrix entries into `batch_size`-long batches."""
        if self.__cmatrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        # The matrix is kept in COO form, so the triples are read from its
        # parallel row / col / data arrays.
        i_indices = tuple(self.__cmatrix.row.tolist())
        j_indices = tuple(self.__cmatrix.col.tolist())
        counts = tuple(self.__cmatrix.data.tolist())
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        """Number of vocabulary words; requires the model to be fit."""
        if self.__words is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before accessing vocab size.")
        return len(self.__words)

    @property
    def words(self):
        """Vocabulary words, indexed by word id."""
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        """Combined (focal + context) embedding matrix produced by `train`."""
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        """Return the integer id of `word`; raises KeyError for unknown words."""
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        """Project the first `word_count` embeddings to 2-D with t-SNE and plot them.

        Args:
            path: file to save the figure to; nothing is saved when None.
            size: figure size in inches.
            word_count: number of leading vocabulary rows to plot.
            embeddings: matrix to plot; defaults to the trained embeddings.
        """
        if embeddings is None:
            embeddings = self.embeddings
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)


def _context_windows(region, left_size, right_size):
    """Yield `(left_context, word, right_context)` for each position in `region`.

    The left context holds up to `left_size` tokens before the word and the
    right context up to `right_size` tokens after it; both come from `_window`.
    """
    for position, focal in enumerate(region):
        before = _window(region, position - left_size, position - 1)
        after = _window(region, position + 1, position + right_size)
        yield (before, focal, after)


def _window(region, start_index, end_index):
    """
    Returns the list of words starting from `start_index`, going to `end_index`
    taken from region. If `start_index` is a negative number, or if `end_index`
    is greater than the index of the last word in region, this function will pad
    its return value with `NULL_WORD`.
    """
    last_index = len(region) + 1
    selected_tokens = region[max(start_index, 0):min(end_index, last_index) + 1]
    return selected_tokens


def _device_for_node(n):
    if n.type == "MatMul":
        return "/gpu:0"
    else:
        return "/cpu:0"


def _batchify(batch_size, *sequences):
    for i in range(0, len(sequences[0]), batch_size):
        yield tuple(sequence[i:i+batch_size] for sequence in sequences)


def _plot_with_labels(low_dim_embs, labels, path, size):
    """Scatter-plot each 2-D point with its label; save to `path` when given.

    `size` is the figure size in inches. When `path` is None the figure is
    neither saved nor closed.
    """
    import matplotlib.pyplot as plt
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    figure = plt.figure(figsize=size)  # in inches
    annotation_style = dict(xytext=(5, 2), textcoords='offset points', ha='right',
                            va='bottom')
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)
    if path is None:
        return
    figure.savefig(path)
    plt.close(figure)


In [18]:
# Scratch cell: draw a single sample from np.random.random() and display it.
np.random.random()

0.9585257661966555

In [5]:
# Display the repr of the co-occurrence matrix `a`.
# NOTE(review): `a` is assigned by the 'cooccurMat_0' load cell that appears
# below this one; on a fresh kernel that cell has to run first.
a

<1000000x1000000 sparse matrix of type '<class 'numpy.float64'>'
	with 124274397 stored elements in Compressed Sparse Row format>

In [3]:
# Load the first co-occurrence shard; the file is closed as soon as it is read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('cooccurMat_0','rb') as shard_file:
    a = pickle.load(shard_file)

In [6]:
# Add shards cooccurMat_1 .. cooccurMat_9 onto `a`, closing each file after reading.
# Re-running this cell adds the shards again, so reload `a` first if you need to repeat it.
for i in range(1,10):
    with open('cooccurMat_'+str(i),'rb') as shard_file:
        b = pickle.load(shard_file)
    a = a+b

In [7]:
# Display the repr of `a` after the shards have been added together.
a

<1000000x1000000 sparse matrix of type '<class 'numpy.float64'>'
	with 474850148 stored elements in Compressed Sparse Row format>

### Testing

In [29]:
# Build the model, initialising both embedding tables from previously saved arrays.
# NOTE(review): `context_array` / `focal_array` are assigned in the cell that loads
# 'context_embed_10' / 'focal_embed_10', which appears further down in this notebook;
# on a fresh kernel that cell has to run first.
model = GloVeModel(embedding_size=100, context_size=10,max_vocab_size=1000000,load_context_vecs=context_array, load_focal_vecs= focal_array)

In [30]:
# Fit the model to the summed co-occurrence matrix `a`, reading word counts
# from the pickled 'wordCount' file.
model.fit_to_cmatrix(a, 'wordCount')

In here
Right here
Loading pretrained values
Loading pretrained values


In [31]:
# Run a single training step (num_steps=1).
model.train(1)

In [32]:
# Pickle the focal and context tables to 'focal_embed' and 'context_embed'
# (empty suffix) in the current working directory.
model.saveEmbeddings()

In [134]:
# Load the pickled word counts; the file is closed as soon as it is read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('wordCount','rb') as count_file:
    e = pickle.load(count_file)

In [4]:
# Load the previously saved embedding tables that are passed to GloVeModel as
# initial values; each file is closed as soon as it is read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('context_embed_10','rb') as context_file:
    context_array = pickle.load(context_file)
with open('focal_embed_10','rb') as focal_file:
    focal_array = pickle.load(focal_file)

In [33]:
# Reload the tables written by model.saveEmbeddings() so they can be compared
# with the arrays used for initialisation; each file is closed after reading.
with open('context_embed','rb') as context_file:
    context_array_new = pickle.load(context_file)
with open('focal_embed','rb') as focal_file:
    focal_array_new = pickle.load(focal_file)

In [36]:
# Display row 100 of the context table that was saved after training.
context_array_new[100]

array([-0.42820725,  0.3279405 ,  0.00255706,  0.13728783, -0.15837634,
       -0.2224654 ,  0.02430093,  0.2706435 ,  0.16189925, -0.3963217 ,
        0.02753036, -0.33744398,  0.01478683,  0.04167126, -0.01372743,
       -0.36690328,  0.18163441, -0.14389753,  0.08244283,  0.03969737,
       -0.12260118, -0.03833848,  0.41795307,  0.31592208,  0.43021423,
       -0.0462651 , -0.19884585, -0.2516142 , -0.25694767,  0.13515314,
       -0.2147453 ,  0.42421624,  0.11931733, -0.00648857,  0.03249184,
       -0.44927317, -0.37367916,  0.06678842, -0.19567408,  0.00882525,
       -0.19148587, -0.36658376,  0.1280104 , -0.16169503, -0.07901064,
       -0.1509476 , -0.08124208, -0.0606637 ,  0.07976799, -0.06875487,
       -0.304153  ,  0.24482484,  0.02833407,  0.29307896,  0.20560703,
        0.04874455, -0.15811616,  0.27799016, -0.17931765, -0.1190341 ,
        0.01104508, -0.01776258, -0.25723696, -0.48068726,  0.19116622,
       -0.55141085, -0.08873697,  0.11446474, -0.20272389, -0.11

In [37]:
# Display row 100 of the context table that was used for initialisation.
context_array[100]

array([-0.42820725,  0.3279405 ,  0.00255706,  0.13728783, -0.15837634,
       -0.2224654 ,  0.02430093,  0.2706435 ,  0.16189925, -0.3963217 ,
        0.02753036, -0.33744398,  0.01478683,  0.04167126, -0.01372743,
       -0.36690328,  0.18163441, -0.14389753,  0.08244283,  0.03969737,
       -0.12260118, -0.03833848,  0.41795307,  0.31592208,  0.43021423,
       -0.0462651 , -0.19884585, -0.2516142 , -0.25694767,  0.13515314,
       -0.2147453 ,  0.42421624,  0.11931733, -0.00648857,  0.03249184,
       -0.44927317, -0.37367916,  0.06678842, -0.19567408,  0.00882525,
       -0.19148587, -0.36658376,  0.1280104 , -0.16169503, -0.07901064,
       -0.1509476 , -0.08124208, -0.0606637 ,  0.07976799, -0.06875487,
       -0.304153  ,  0.24482484,  0.02833407,  0.29307896,  0.20560703,
        0.04874455, -0.15811616,  0.27799016, -0.17931765, -0.1190341 ,
        0.01104508, -0.01776258, -0.25723696, -0.48068726,  0.19116622,
       -0.55141085, -0.08873697,  0.11446474, -0.20272389, -0.11