In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm



In [1]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import time
import string
import itertools
import pickle

from collections import Counter
from itertools import filterfalse
from functools import reduce
from scipy import sparse

import gc

from dask import distributed
from dask.distributed import Client, LocalCluster

[nltk_data] Downloading package punkt to /home/santhosr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### GloVe Code

In [2]:
from __future__ import division
from collections import Counter, defaultdict
import os
from random import shuffle
import tensorflow as tf


class NotTrainedError(Exception):
    """Raised when trained embeddings are requested before the model has been trained."""

class NotFitToCorpusError(Exception):
    """Raised when vocabulary or co-occurrence data is needed before the model was fit."""

class GloVeModel():
    """GloVe word-embedding model built on the TensorFlow 1.x graph/session API.

    Workflow: construct the model, fit it with `fit_to_corpus` (tokenized
    documents) or `fit_to_cmatrix` (pre-computed co-occurrence counts), call
    `train`, then read the vectors through `embeddings` / `embedding_for`.
    """

    def __init__(self, embedding_size, context_size, max_vocab_size=100000, min_occurrences=1,
                 scaling_factor=3/4, cooccurrence_cap=100, batch_size=512, learning_rate=0.05):
        """Store the hyper-parameters; nothing is computed until the model is fit.

        Args:
            embedding_size: dimensionality of each embedding vector.
            context_size: an int (same window on both sides) or a
                `(left, right)` tuple of window sizes.
            max_vocab_size: keep at most this many most-frequent words.
            min_occurrences: drop words seen fewer times than this.
            scaling_factor: exponent of the co-occurrence weighting function.
            cooccurrence_cap: count at which the weighting saturates at 1.
            batch_size: number of co-occurrence entries per training step.
            learning_rate: learning rate passed to the Adagrad optimizer.

        Raises:
            ValueError: if `context_size` is neither an int nor a tuple.
        """
        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError("`context_size` should be an int or a tuple of two ints")
        self.max_vocab_size = max_vocab_size
        self.min_occurrences = min_occurrences
        self.scaling_factor = scaling_factor
        self.cooccurrence_cap = cooccurrence_cap
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.__words = None
        self.__word_to_id = None
        self.__cooccurrence_matrix = None
        self.__embeddings = None

    def fit_to_corpus(self, corpus):
        """Build vocabulary and co-occurrence counts from `corpus`, then build the TF graph.

        `corpus` is an iterable of regions (token sequences). It is consumed
        once, so a generator cannot be reused for a second call.
        """
        self.__fit_to_corpus(corpus, self.max_vocab_size, self.min_occurrences,
                             self.left_context, self.right_context)
        self.__build_graph()

    def __fit_to_corpus(self, corpus, vocab_size, min_occurrences, left_size, right_size):
        """Count words and distance-weighted co-occurrences over every region of `corpus`."""
        word_counts = Counter()
        cooccurrence_counts = defaultdict(float)
        for region in tqdm(corpus):
            word_counts.update(region)
            for l_context, word, r_context in _context_windows(region, left_size, right_size):
                # The left context is reversed so that index 0 is the nearest word.
                for i, context_word in enumerate(l_context[::-1]):
                    # add (1 / distance from focal word) for this pair
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(r_context):
                    cooccurrence_counts[(word, context_word)] += 1 / (i + 1)
        if len(cooccurrence_counts) == 0:
            raise ValueError("No coccurrences in corpus. Did you try to reuse a generator?")
        self.__words = [word for word, count in word_counts.most_common(vocab_size)
                        if count >= min_occurrences]
        self.__word_to_id = {word: i for i, word in enumerate(self.__words)}
        # Re-key the counts by word id and drop pairs with an out-of-vocabulary word.
        self.__cooccurrence_matrix = {
            (self.__word_to_id[words[0]], self.__word_to_id[words[1]]): count
            for words, count in cooccurrence_counts.items()
            if words[0] in self.__word_to_id and words[1] in self.__word_to_id}

    def fit_to_cmatrix(self, cmatrix, vocab):
        """Fit the model to pre-computed co-occurrence counts instead of raw text.

        Args:
            cmatrix: either an object exposing `tocoo()` (e.g. a scipy sparse
                matrix) indexed by word id, or a dict mapping
                `(focal_id, context_id)` to a count.
            vocab: either a dict mapping word -> id, or a sequence of words in
                which the position of a word is its id. Ids must be exactly
                `0 .. len(vocab) - 1` because they index the embedding rows.

        Entries with a non-positive count, or with an id outside the
        vocabulary, are skipped (the loss takes the log of each count). Every
        kept entry is stored in a Python dict, matching what `fit_to_corpus`
        produces.

        Raises:
            ValueError: if the ids in `vocab` are not contiguous from 0, if
                `vocab` holds duplicate words, or if no usable entry remains.
        """
        if isinstance(vocab, dict):
            word_to_id = dict(vocab)
            words = sorted(word_to_id, key=word_to_id.get)
            if [word_to_id[word] for word in words] != list(range(len(words))):
                raise ValueError("`vocab` ids must be exactly 0..len(vocab)-1")
        else:
            words = list(vocab)
            word_to_id = {word: i for i, word in enumerate(words)}
            if len(word_to_id) != len(words):
                raise ValueError("`vocab` contains duplicate words")

        if hasattr(cmatrix, "tocoo"):
            coo = cmatrix.tocoo()
            entries = zip(coo.row, coo.col, coo.data)
        else:
            entries = ((i, j, count) for (i, j), count in cmatrix.items())

        vocab_size = len(words)
        cooccurrences = defaultdict(float)
        for i, j, count in entries:
            if count > 0 and 0 <= i < vocab_size and 0 <= j < vocab_size:
                # Summing tolerates duplicate (row, col) entries in COO input.
                cooccurrences[(int(i), int(j))] += float(count)
        if len(cooccurrences) == 0:
            raise ValueError("No positive cooccurrence counts found for the given vocab")

        self.__words = words
        self.__word_to_id = word_to_id
        self.__cooccurrence_matrix = dict(cooccurrences)
        self.__build_graph()

    def __build_graph(self):
        """Create the TF graph: inputs, embedding/bias variables, weighted loss, optimizer."""
        self.__graph = tf.Graph()
        with self.__graph.as_default(), self.__graph.device(_device_for_node):
            count_max = tf.constant([self.cooccurrence_cap], dtype=tf.float32,
                                    name='max_cooccurrence_count')
            scaling_factor = tf.constant([self.scaling_factor], dtype=tf.float32,
                                         name="scaling_factor")

            # Placeholders have a fixed length of `batch_size`, which is why
            # `train` skips any batch of a different length.
            self.__focal_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="focal_words")
            self.__context_input = tf.placeholder(tf.int32, shape=[self.batch_size],
                                                  name="context_words")
            self.__cooccurrence_count = tf.placeholder(tf.float32, shape=[self.batch_size],
                                                       name="cooccurrence_count")

            # NOTE(review): the bounds are passed positionally as (1.0, -1.0),
            # i.e. in the reverse of the usual (minval, maxval) order; kept as is.
            focal_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                name="focal_embeddings")
            context_embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], 1.0, -1.0),
                name="context_embeddings")

            focal_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                       name='focal_biases')
            context_biases = tf.Variable(tf.random_uniform([self.vocab_size], 1.0, -1.0),
                                         name="context_biases")

            focal_embedding = tf.nn.embedding_lookup([focal_embeddings], self.__focal_input)
            context_embedding = tf.nn.embedding_lookup([context_embeddings], self.__context_input)
            focal_bias = tf.nn.embedding_lookup([focal_biases], self.__focal_input)
            context_bias = tf.nn.embedding_lookup([context_biases], self.__context_input)

            # weight = min(1, (count / cap) ** scaling_factor)
            weighting_factor = tf.minimum(
                1.0,
                tf.pow(
                    tf.div(self.__cooccurrence_count, count_max),
                    scaling_factor))

            embedding_product = tf.reduce_sum(tf.multiply(focal_embedding, context_embedding), 1)

            log_cooccurrences = tf.log(tf.to_float(self.__cooccurrence_count))

            # (focal . context + focal_bias + context_bias - log(count)) ** 2
            distance_expr = tf.square(tf.add_n([
                embedding_product,
                focal_bias,
                context_bias,
                tf.negative(log_cooccurrences)]))

            single_losses = tf.multiply(weighting_factor, distance_expr)
            self.__total_loss = tf.reduce_sum(single_losses)
            tf.summary.scalar("GloVe_loss", self.__total_loss)
            self.__optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(
                self.__total_loss)
            self.__summary = tf.summary.merge_all()

            # The embedding exposed to callers is the sum of both tables.
            self.__combined_embeddings = tf.add(focal_embeddings, context_embeddings,
                                                name="combined_embeddings")

    def train(self, num_epochs, log_dir=None, summary_batch_interval=1000,
              tsne_epoch_interval=None):
        """Run `num_epochs` passes over the shuffled batches and store the embeddings.

        Args:
            num_epochs: number of passes over all batches.
            log_dir: directory for TensorBoard summaries and t-SNE images;
                nothing is written when it is None.
            summary_batch_interval: write a summary every this many steps.
            tsne_epoch_interval: save a t-SNE plot every this many epochs.

        Batches whose length differs from `batch_size` are skipped, so a model
        with fewer co-occurrence entries than `batch_size` performs no updates.

        Raises:
            NotFitToCorpusError: if the model has not been fit yet.
        """
        should_write_summaries = log_dir is not None and summary_batch_interval
        should_generate_tsne = log_dir is not None and tsne_epoch_interval
        batches = self.__prepare_batches()
        total_steps = 0
        with tf.Session(graph=self.__graph) as session:
            if should_write_summaries:
                print("Writing TensorBoard summaries to {}".format(log_dir))
                summary_writer = tf.summary.FileWriter(log_dir, graph=session.graph)
            tf.global_variables_initializer().run()
            for epoch in range(num_epochs):
                shuffle(batches)
                for batch_index, batch in enumerate(batches):
                    i_s, j_s, counts = batch
                    if len(counts) != self.batch_size:
                        continue
                    feed_dict = {
                        self.__focal_input: i_s,
                        self.__context_input: j_s,
                        self.__cooccurrence_count: counts}
                    session.run([self.__optimizer], feed_dict=feed_dict)
                    if should_write_summaries and (total_steps + 1) % summary_batch_interval == 0:
                        summary_str = session.run(self.__summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, total_steps)
                    total_steps += 1
                if should_generate_tsne and (epoch + 1) % tsne_epoch_interval == 0:
                    current_embeddings = self.__combined_embeddings.eval()
                    output_path = os.path.join(log_dir, "epoch{:03d}.png".format(epoch + 1))
                    self.generate_tsne(output_path, embeddings=current_embeddings)
            self.__embeddings = self.__combined_embeddings.eval()
            if should_write_summaries:
                summary_writer.close()

    def embedding_for(self, word_str_or_id):
        """Return the trained vector for a word (`str`) or a word id (`int`).

        Any other argument type falls through both branches and returns None.
        """
        if isinstance(word_str_or_id, str):
            return self.embeddings[self.__word_to_id[word_str_or_id]]
        elif isinstance(word_str_or_id, int):
            return self.embeddings[word_str_or_id]

    def __prepare_batches(self):
        """Flatten the co-occurrence dict into a list of (focal ids, context ids, counts) batches."""
        if self.__cooccurrence_matrix is None:
            raise NotFitToCorpusError(
                "Need to fit model to corpus before preparing training batches.")
        cooccurrences = [(word_ids[0], word_ids[1], count)
                         for word_ids, count in self.__cooccurrence_matrix.items()]
        i_indices, j_indices, counts = zip(*cooccurrences)
        return list(_batchify(self.batch_size, i_indices, j_indices, counts))

    @property
    def vocab_size(self):
        """Number of words in the fitted vocabulary.

        Goes through `words`, so an unfit model raises `NotFitToCorpusError`
        rather than a bare `TypeError` from `len(None)`.
        """
        return len(self.words)

    @property
    def words(self):
        """Vocabulary list in which the index of a word is its id."""
        if self.__words is None:
            raise NotFitToCorpusError("Need to fit model to corpus before accessing words.")
        return self.__words

    @property
    def embeddings(self):
        """Combined (focal + context) embedding matrix produced by `train`."""
        if self.__embeddings is None:
            raise NotTrainedError("Need to train model before accessing embeddings")
        return self.__embeddings

    def id_for_word(self, word):
        """Return the integer id of `word`; raises KeyError for unknown words."""
        if self.__word_to_id is None:
            raise NotFitToCorpusError("Need to fit model to corpus before looking up word ids.")
        return self.__word_to_id[word]

    def generate_tsne(self, path=None, size=(100, 100), word_count=1000, embeddings=None):
        """Project the first `word_count` embeddings to 2-D with t-SNE and plot them.

        Uses the trained embeddings unless `embeddings` is given. The figure is
        saved to `path` when one is provided.
        """
        if embeddings is None:
            embeddings = self.embeddings
        from sklearn.manifold import TSNE
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        low_dim_embs = tsne.fit_transform(embeddings[:word_count, :])
        labels = self.words[:word_count]
        return _plot_with_labels(low_dim_embs, labels, path, size)


def _context_windows(region, left_size, right_size):
    """Yield `(left_context, word, right_context)` for every position in `region`.

    The left context holds up to `left_size` tokens before the word and the
    right context up to `right_size` tokens after it; both are obtained by
    slicing through `_window`, so they shrink near the edges of the region.
    """
    for index, focal in enumerate(region):
        before = _window(region, index - left_size, index - 1)
        after = _window(region, index + 1, index + right_size)
        yield (before, focal, after)


def _window(region, start_index, end_index):
    """
    Returns the list of words starting from `start_index`, going to `end_index`
    taken from region. If `start_index` is a negative number, or if `end_index`
    is greater than the index of the last word in region, this function will pad
    its return value with `NULL_WORD`.
    """
    last_index = len(region) + 1
    selected_tokens = region[max(start_index, 0):min(end_index, last_index) + 1]
    return selected_tokens


def _device_for_node(n):
    if n.type == "MatMul":
        return "/gpu:0"
    else:
        return "/cpu:0"


def _batchify(batch_size, *sequences):
    for i in range(0, len(sequences[0]), batch_size):
        yield tuple(sequence[i:i+batch_size] for sequence in sequences)


def _plot_with_labels(low_dim_embs, labels, path, size):
    """Scatter-plot 2-D points and annotate each one with its label.

    `size` is the figure size in inches. When `path` is given the figure is
    saved there and then closed; otherwise it is left open.
    """
    import matplotlib.pyplot as plt
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    figure = plt.figure(figsize=size)  # in inches
    annotate_opts = dict(xytext=(5, 2), textcoords='offset points', ha='right',
                         va='bottom')
    for row, label in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotate_opts)
    if path is None:
        return
    figure.savefig(path)
    plt.close(figure)


### Reading Data

In [3]:
# Read the whole corpus into memory, one list entry per line (trailing newlines
# are kept). The context manager closes the file handle once reading is done.
with open('/home/santhosr/Documents/Courses/GloVe/wikiData/WestburyLab.Wikipedia.Corpus.txt','r') as f:
    dataLines = f.readlines()

In [4]:
len(dataLines)

30749930

In [18]:
def cleanLine(x):
    """Normalize one raw text line and return its list of tokens.

    Newlines are removed, the text is lower-cased, every character in
    `string.punctuation` is deleted, and the result is split with
    `word_tokenize`.
    """
    lowered = x.replace('\n','').lower()
    # The third argument of maketrans lists the characters to delete.
    table = str.maketrans(' ',' ', string.punctuation)
    return word_tokenize(lowered.translate(table))

def processBatch(x):
    """Tokenize every line in `x` with `cleanLine` and keep only documents with two or more tokens."""
    tokenized = map(cleanLine, x)
    #Removing all docs with length < 2
    return [doc for doc in tokenized if len(doc)>=2]

In [20]:
c = LocalCluster()

In [21]:
client = Client(c)

In [22]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:33321  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 67.19 GB


In [23]:
# Tokenize the corpus in rounds: each round maps `tasksPerRound` slices of
# `linesPerTask` lines onto the cluster and pickles the combined result.
# 10 rounds * 20 tasks * 153740 lines covers the first 30,748,000 lines.
linesPerTask = 153740
tasksPerRound = 20
numRounds = 10

for i in range(numRounds):

    chunks = [dataLines[linesPerTask*j: linesPerTask*(j+1)]
              for j in range(tasksPerRound*i, tasksPerRound*(i+1))]
    a = client.map(processBatch, chunks)
    b = client.gather(a)
    b = list(itertools.chain.from_iterable(b))

    with open('tokenizedData_'+str(i),'wb') as f:
        pickle.dump(b,f)

    # Only `a` holds futures. `b` is plain gathered data, so there is nothing
    # to cancel for it; dropping the references is enough.
    client.cancel(a)
    del a
    del b
    del chunks
    

  (['\n', 'Anarchism.\n', 'Anarchism is a political  ...  to fade.\n'],)
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


In [16]:
len(dataLines)

30749930

In [8]:
b = processBatch(dataLines[30748000:])

In [9]:
with open('tokenizedData_11','wb') as f:
    pickle.dump(b,f)

In [19]:
client.scheduler

<pooled rpc to 'tcp://127.0.0.1:37921'>

####  Cleaning Tokens

In [90]:
def cleanTokenFile(i):
    """Rewrite `tokenizedData_<i>` without the documents that have fewer than two tokens.

    Args:
        i: index used to build the pickle file name `'tokenizedData_' + str(i)`.

    Returns:
        1 on completion (a simple success marker for `client.map`).

    Prints how many documents were dropped.
    """
    fileName = 'tokenizedData_'+str(i)
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # notebook produced itself.
    with open(fileName,'rb') as f:
        x = pickle.load(f)
    startSize = len(x)

    #Removing all documents with 1 or less words in them
    # (the test must look at each document, not at the length of the whole list)
    x = [doc for doc in x if len(doc)>=2]

    endSize = len(x)

    print("Dropped {} items".format(startSize - endSize))

    with open(fileName,'wb') as f:
        pickle.dump(x,f)

    del x

    return 1
    
    

In [3]:
client = Client()

In [95]:
iterations = 1
startTime = time.time()
# time.clock() was removed in Python 3.8; time.process_time() is the documented
# replacement and reports CPU time of the current (client) process.
cpuStart = time.process_time()


# NOTE(review): the tokenizing loop in this notebook writes tokenizedData_0..9,
# while this maps over indices 1..10 - confirm the intended file indices.
a = client.map(cleanTokenFile, list(range(1,11)))
b = client.gather(a)

endTime = time.time()
cpuEnd = time.process_time()



print("Iterations : {} Avg. Clock time : {} Avg. CPU time : {} Clock time : {}  CPU time : {}".format(
    iterations,
    np.round((endTime - startTime)/iterations,4),
    np.round((cpuEnd - cpuStart)/iterations, 4),
    np.round(endTime - startTime,4),
    np.round(cpuEnd - cpuStart,4)))

EOFError: Ran out of input



In [94]:
client.restart()

0,1
Client  Scheduler: tcp://127.0.0.1:33811  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 67.19 GB


### Creating Vocab

In [8]:
def getVocab(i):
    """Count token occurrences in the pickled token file `tokenizedData_<i>`.

    Args:
        i: index used to build the file name `'tokenizedData_' + str(i)`.

    Returns:
        A `Counter` mapping each token to its number of occurrences across all
        documents in the file.
    """
    fileName = 'tokenizedData_'+str(i)
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # notebook produced itself.
    with open(fileName,'rb') as f:
        docs = pickle.load(f)

    counter = Counter()
    for doc in docs:
        counter.update(doc)

    # Drop the documents before collecting so the memory can actually be freed.
    del docs
    gc.collect()

    return counter

In [9]:
client = Client()

# client.restart()

In [10]:
iterations = 1
startTime = time.time()
# time.clock() was removed in Python 3.8; time.process_time() is the documented
# replacement and reports CPU time of the current (client) process.
cpuStart = time.process_time()


# One Counter per token file; the workers do the counting.
a = client.map(getVocab, list(range(10)))
b = client.gather(a)


endTime = time.time()
cpuEnd = time.process_time()


print("Iterations : {} Avg. Clock time : {} Avg. CPU time : {} Clock time : {}  CPU time : {}".format(
    iterations,
    np.round((endTime - startTime)/iterations, 4),
    np.round((cpuEnd - cpuStart)/iterations, 4),
    np.round(endTime - startTime,4),
    np.round(cpuEnd - cpuStart,4)))

Iterations : 1 Avg. Clock time : 170.975 Avg. CPU time : 14.5834 Clock time : 170.975  CPU time : 14.5834


In [11]:
# Fold the per-file counters gathered from getVocab into a single Counter.
# `c` aliases b[0], so the first gathered counter is updated in place;
# Counter.update adds the incoming counts to the existing ones.
c = b[0]

for i in range(1,len(b)):
    c.update(b[i])

In [12]:
with open('wordCount','wb') as f:
    pickle.dump(c,f)

In [13]:
import gc

gc.collect()

115

### Modifying Tokens

In [3]:
x = pickle.load(open('tokenizedData_0','rb'))

In [None]:
vocabSize = 100000
wordCount = pickle.load(open('wordCount','rb'))

In [None]:
vocab = wordCount.most_common(vocabSize)

## Creating the Word-ID dictionaries
id_to_word = {i:x[0] for i,x in enumerate(vocab)}

word_to_id = {value:key for key,value in id_to_word.items()}


### Loading Token Data

In [5]:
def modifyToken(x):
    """Unfinished per-document token transform over `x` (an iterable of documents).

    The original cell stopped at an empty `for doc in x:` loop, which is a
    syntax error. Until the transformation is written, calling this raises
    instead of silently doing nothing.

    Raises:
        NotImplementedError: always.
    """
    # TODO: implement the per-document transformation.
    raise NotImplementedError("modifyToken has not been implemented yet")






In [9]:
client = Client()

In [10]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:45079  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 67.19 GB


In [None]:
#Broadcasting word_to_id to all workers
# Client.scatter treats a dict argument as a {key: value} collection and creates
# one future per entry. Wrapping the dict in a list ships the whole mapping as a
# single object and gives back a single future for it.
[word_to_id_future] = client.scatter([word_to_id], broadcast=True)

In [16]:
fileName = 'tokenizedData_'+str(0)
x = pickle.load(open(fileName,'rb'))

In [24]:
corpus = [x[0]]

In [11]:
def createCMatrix(corpus, windowSize=10, wordToId=None, matrixSize=None):
    """Build a symmetric, distance-weighted co-occurrence matrix for `corpus`.

    For every in-vocabulary word, each in-vocabulary word among the
    `windowSize` tokens before it contributes `1 / distance` to both
    `[center, context]` and `[context, center]`.

    Args:
        corpus: iterable of documents, each a sequence of tokens.
        windowSize: number of preceding tokens treated as context.
        wordToId: mapping token -> row/column index; defaults to the
            notebook-level `word_to_id`.
        matrixSize: side length of the square matrix; defaults to the
            notebook-level `vocabSize`.

    Returns:
        A `scipy.sparse.lil_matrix` of shape `(matrixSize, matrixSize)` and
        dtype float64.
    """
    if wordToId is None:
        wordToId = word_to_id
    if matrixSize is None:
        matrixSize = vocabSize

    cooccurrences = sparse.lil_matrix((matrixSize, matrixSize), dtype=np.float64)

    for doc in corpus:

        # Resolve every token once; None marks an out-of-vocabulary token.
        docIds = [wordToId.get(word) for word in doc]

        for center_index, center_id in enumerate(docIds):

            if center_id is None:
                continue

            start = max(0, center_index - windowSize)

            for context_index in range(start, center_index):

                context_id = docIds[context_index]
                if context_id is None:
                    continue

                # Out-of-vocabulary tokens still occupy a position, so the
                # distance is measured in raw token positions.
                inc = 1.0/float(center_index - context_index)

                cooccurrences[center_id , context_id] += inc
                cooccurrences[context_id , center_id] += inc

    return cooccurrences
            
            
def split(a, n):
    """Lazily split `a` into `n` contiguous slices whose lengths differ by at most one.

    The first `len(a) % n` slices receive one extra element. Returns a
    generator of slices.
    """
    base, extra = divmod(len(a), n)

    def _bounds(idx):
        # Each earlier slice that took an extra element shifts the start by one.
        lower = idx * base + min(idx, extra)
        upper = (idx + 1) * base + min(idx + 1, extra)
        return lower, upper

    return (a[lower:upper] for lower, upper in map(_bounds, range(n)))



In [13]:
corpus = pickle.load(open('tokenizedData_'+str(0),'rb'))

In [24]:
client.restart()

0,1
Client  Scheduler: tcp://127.0.0.1:45079  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 67.19 GB


In [None]:
for i in range(1):

#     corpus = pickle.load(open('tokenizedData_'+str(i),'rb'))

    matrices = []

    # split() returns a generator with no length, so tqdm needs `total` to
    # show real progress instead of a bare iteration counter.
    for sub in tqdm(split(corpus,10), total=10):

        print("Start")

        # One task per slice; every task returns its own partial matrix.
        a = client.map(createCMatrix, list(split(sub,16)))
        b = client.gather(a)

        # Sum the partial matrices of this sub-corpus into one.
        mat = reduce(lambda x,y : x+y, b)

        matrices.append(mat.copy())

        # Only `a` holds futures. `b` is gathered data, so there is nothing
        # to cancel for it.
        client.cancel(a)

        del a
        del b
        
        
        
        
        
    
    

0it [00:00, ?it/s]

Start


1it [12:02, 722.17s/it]

Start


2it [24:03, 722.06s/it]

Start


In [None]:
type(b)



In [None]:
type(b)

In [63]:
e = createCMatrix(corpus,10)

  4%|▍         | 103733/2484555 [11:18<3:50:38, 172.04it/s]

KeyboardInterrupt: 

In [12]:
len(corpus)

2484555

In [45]:
r= client.map(createCMatrix, [ [x[0]] , [x[1]]  ], [10,10])

In [46]:
e= client.gather(r)

In [57]:
list(split([1,5,3,7,8,4,6,5,4],4))

[[1, 5, 3], [7, 8], [4, 6], [5, 4]]

In [50]:
e[0]

<100000x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 1225 stored elements in LInked List format>

In [51]:
e[1]

<100000x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 1258 stored elements in LInked List format>

In [25]:
# This cell used to repeat the body of createCMatrix without the
# out-of-vocabulary check, so any token missing from word_to_id raised KeyError.
# Calling the function keeps a single implementation; it uses a window of 10.
windowSize = 10

cooccurrences = createCMatrix(corpus)
            
            

        





In [28]:
cooccurrences[word_to_id['anarchism'], word_to_id['philosophy']]

0.45

In [22]:
for i,j in enumerate(x[0]):
    print(i,j)


0 anarchism
1 is
2 a
3 political
4 philosophy
5 which
6 considers
7 the
8 state
9 undesirable
10 unnecessary
11 and
12 harmful
13 and
14 instead
15 promotes
16 a
17 stateless
18 society
19 or
20 anarchy
21 it
22 seeks
23 to
24 diminish
25 or
26 even
27 abolish
28 authority
29 in
30 the
31 conduct
32 of
33 human
34 relations
35 anarchists
36 may
37 widely
38 disagree
39 on
40 what
41 additional
42 criteria
43 are
44 required
45 in
46 anarchism
47 the
48 oxford
49 companion
50 to
51 philosophy
52 says
53 there
54 is
55 no
56 single
57 defining
58 position
59 that
60 all
61 anarchists
62 hold
63 and
64 those
65 considered
66 anarchists
67 at
68 best
69 share
70 a
71 certain
72 family
73 resemblance


### Creating Cooccur Matrix using Glove_python


In [None]:
b= pickle.load(open('wikiData/tokenizedData_4','rb'))



In [None]:
## Importing Glove_Cython
import os
currDir = os.getcwd()
os.chdir("/home/santhosr/Documents/Courses/GloVe/glove_cython/Glove_Cython")
from glove_python import *
os.chdir(currDir)

## Importing Dictionary
vocabSize = 1000000

wordCount = pickle.load(open('wordCount','rb'))


vocab = wordCount.most_common(vocabSize)

id_to_word = {i:x[0] for i,x in enumerate(vocab)}

word_to_id = {value:key for key,value in id_to_word.items()}

c = Corpus(word_to_id)


%timeit -n 1 -r 1 c.fit(b, ignore_missing=True)


mat = c.matrix.tocsr()


with open('coo_4','wb') as f:
    pickle.dump(mat,f)

### Loading Cooccur Matrix (Glove_Python)

In [9]:
a = pickle.load(open('/home/santhosr/Documents/Courses/GloVe/coo_0','rb'))

In [10]:
# Add the remaining per-file matrices (coo_1 .. coo_9) onto `a`.
for i in range(1,10):
    # The context manager closes each file right after it has been loaded.
    with open('/home/santhosr/Documents/Courses/GloVe/coo_'+str(i),'rb') as f:
        b = pickle.load(f)

    a = a+b

In [12]:
with open('coo_full','wb') as f:
    pickle.dump(a,f)

In [13]:
## Importing Glove_Cython
import os
currDir = os.getcwd()
os.chdir("/home/santhosr/Documents/Courses/GloVe/glove_cython/Glove_Cython")
try:
    from glove_python import *
finally:
    # Restore the working directory even if the import fails, so the relative
    # paths below keep resolving against the notebook directory.
    os.chdir(currDir)

## Importing Dictionary
vocabSize = 1000000

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('wordCount','rb') as f:
    wordCount = pickle.load(f)


vocab = wordCount.most_common(vocabSize)

# Ids follow frequency rank: id 0 is the most common word.
id_to_word = {i:x[0] for i,x in enumerate(vocab)}

word_to_id = {value:key for key,value in id_to_word.items()}

In [14]:
c = Corpus(word_to_id)

In [15]:
c.matrix = a.tocoo()

In [16]:
glove = Glove(no_components=200, learning_rate=0.05)

In [17]:
glove.add_dictionary(c.dictionary)

In [None]:
glove.fit(c.matrix, epochs=2,
          no_threads=8, verbose=True, wordList = ['board','approved','reuters','food','market','mutual','bond','cash','flow','money','directors'],save_gap=1)


Performing 2 training epochs with 8 threads
Epoch 0


### Loading Cooccur Matrix

In [3]:
a = pickle.load(open('cooccurMat_0','rb'))

In [4]:
b = pickle.load(open('cooccurMat_1','rb'))

In [14]:
a

<1000000x1000000 sparse matrix of type '<class 'numpy.float64'>'
	with 248425878 stored elements in Compressed Sparse Row format>

In [17]:
b

<1000000x1000000 sparse matrix of type '<class 'numpy.float64'>'
	with 246874559 stored elements in Compressed Sparse Row format>

In [5]:
a = a + b

In [None]:
r = a.todok()

In [6]:
a[1,2]

1218940.8619047815

In [13]:
a = [(1,2),(3,4),(5,6),(7,8)]

i,j = zip(*a)

In [14]:
i

(1, 3, 5, 7)

### Training Glove 

In [13]:
glove = GloVeModel(embedding_size=100, context_size = 10)

In [None]:
glove.fit_to_corpus(itemlist)

 31%|███▏      | 962685/3074800 [11:13<23:51, 1475.49it/s] 