In [0]:
# Colab-specific setup: make the course folder on the mounted Google Drive the
# working directory (presumably so the local `utils` / `brown` modules and the
# cached .npy / .json files resolve -- adjust the path for other environments).
import os

os.chdir('/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2')

In [0]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

from datetime import datetime
from sklearn.utils import shuffle
from utils import find_analogies


import sys

from utils import get_wikipedia_data
from brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

In [0]:
class Glove:
    """Word embeddings from a truncated SVD of the log co-occurrence matrix.

    Attributes:
        D: number of SVD components (embedding dimensionality).
        V: vocabulary size; the co-occurrence matrix is (V, V).
        context_sz: number of tokens on each side that count as context.
        W: (V, D) array set by fit() -- left factor of the decomposition.
        U: (V, D) array set by fit() -- transposed SVD components.
    """

    def __init__(self, D, V, context_sz):
        self.D = D
        self.V = V
        self.context_sz = context_sz

    def _build_cooccurrence(self, sentences):
        """Return the dense (V, V) distance-weighted co-occurrence matrix.

        Args:
            sentences: sequence of sentences, each a sequence of integer word
                indices in [0, V).

        Each (centre, context) pair adds 1 / distance to both X[wi, wj] and
        X[wj, wi]. Word indices 0 and 1 are treated as the START and END
        tokens (assumed to match the word2idx built by the data loaders).
        """
        V = self.V
        ctx = self.context_sz

        # paper calls it X, so we call it X instead of calling the training data X
        # TODO: would it be better to use a sparse matrix?
        X = np.zeros((V, V))
        N = len(sentences)
        print("Number of sentences to process", N)
        for it, sentence in enumerate(sentences, 1):
            if it % 10000 == 0:
                print('Processed:', it, '/', N)

            n = len(sentence)
            for i in range(n):
                # i and j are positions in the sentence, NOT word indices
                wi = sentence[i]

                start = max(0, i - ctx)
                # +1 because range() excludes its end: the right window must
                # span ctx tokens, exactly like the left one
                end = min(n, i + ctx + 1)

                # make sure the START / END tokens are part of some context:
                # START sits at position -1, END at position n
                if i - ctx < 0:
                    points = 1.0 / (i + 1)
                    X[wi, 0] += points
                    X[0, wi] += points
                if i + ctx >= n:
                    points = 1.0 / (n - i)
                    X[wi, 1] += points
                    X[1, wi] += points

                # left side
                for j in range(start, i):
                    wj = sentence[j]
                    points = 1.0 / (i - j)  # this is +ve
                    X[wi, wj] += points
                    X[wj, wi] += points

                # right side
                for j in range(i + 1, end):
                    wj = sentence[j]
                    points = 1.0 / (j - i)  # this is +ve
                    X[wi, wj] += points
                    X[wj, wi] += points

        return X

    def fit(self, sentences, cc_matrix=None):
        """Build (or load) the co-occurrence matrix and factorise it with SVD.

        Args:
            sentences: sequence of sentences of integer word indices. Only
                read when the co-occurrence matrix has to be built.
            cc_matrix: optional path of a .npy cache for the co-occurrence
                matrix. If the file exists it is loaded; otherwise the matrix
                is built and saved there. None disables caching entirely.

        Sets self.W and self.U and prints the squared reconstruction error.
        """
        t0 = datetime.now()
        D = self.D

        if cc_matrix is not None and os.path.exists(cc_matrix):
            X = np.load(cc_matrix)
        else:
            X = self._build_cooccurrence(sentences)
            if cc_matrix is not None:
                # save the cc matrix because it takes forever to create
                np.save(cc_matrix, X)

        print("max in X:", X.max())

        # target
        logX = np.log1p(X)

        print("max in log(X):", logX.max())

        print("time to build co-occurrence matrix:", (datetime.now() - t0))

        # subtract global mean
        mu = logX.mean()

        model = TruncatedSVD(n_components=D)
        Z = model.fit_transform(logX - mu)  # Z = U_svd * Sigma

        # Undo the scaling by the singular values (not the explained
        # variances) so that W holds the left singular vectors. Components
        # with a zero singular value have an all-zero column in Z; dividing
        # by 1 there avoids producing NaN/inf.
        sigma = model.singular_values_
        self.W = Z / np.where(sigma > 0, sigma, 1.0)
        self.U = model.components_.T

        # calculate cost once; W.dot(diag(sigma)).dot(U.T) is Z.dot(components_)
        delta = Z.dot(model.components_) + mu - logX
        cost = (delta * delta).sum()
        print("SVD cost", cost)

    def save(self, fn):
        """Write the embeddings to `fn` as an .npz archive (arr_0=W, arr_1=U.T)."""
        # function word_analogies expects a (V,D) matrix and a (D,V) matrix
        arrays = [self.W, self.U.T]
        np.savez(fn, *arrays)



In [0]:
def main(we_file, w2i_file, use_brown=True, n_files=100):
    """Train the SVD-based Glove model and persist embeddings and vocabulary.

    Args:
        we_file: output path handed to Glove.save() for the embeddings.
        w2i_file: JSON file holding the word -> index mapping; read when the
            co-occurrence cache already exists, written otherwise.
        use_brown: True to use the Brown corpus loader, False for Wikipedia.
        n_files: number of Wikipedia files to read (also part of the cache
            file name); only used when use_brown is False.
    """
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s_svd.npy" % n_files

    # Hacky shortcut: training only needs the co-occurrence matrix, so when
    # its cache file is present the raw corpus is not loaded at all.
    if not os.path.exists(cc_matrix):
        if use_brown:
            # words that must stay in the limited vocabulary
            keep_words = {
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            }
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=5000, keep_words=keep_words
            )
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)
    else:
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # placeholder -- fit() loads the cached matrix instead

    vocab_size = len(word2idx)
    model = Glove(100, vocab_size, 10)

    # factorise the log co-occurrence matrix with truncated SVD, then save
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)

In [15]:
if __name__ == '__main__':
    # Output files for the Wikipedia run; for the Brown corpus use
    # 'glove_svd_brown.npz' / 'glove_word2idx_brown.json' and use_brown=True.
    we = 'glove_svd_50.npz'
    w2i = 'glove_word2idx_50.json'
    main(we, w2i, use_brown=False)

    # load back embeddings: arr_0 is (V, D), arr_1 is (D, V)
    npz = np.load(we)
    W1, W2 = npz['arr_0'], npz['arr_1']

    with open(w2i) as f:
        word2idx = json.load(f)
    idx2word = {idx: word for word, idx in word2idx.items()}

    # word triples passed to find_analogies, in evaluation order
    analogy_triples = [
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
        ('france', 'french', 'english'),
        ('japan', 'japanese', 'chinese'),
        ('japan', 'japanese', 'italian'),
        ('japan', 'japanese', 'australian'),
        ('december', 'november', 'june'),
    ]

    # evaluate both ways of combining the two embedding matrices
    for concat in (True, False):
        print("** concat:", concat)

        if concat:
            We = np.hstack([W1, W2.T])
        else:
            We = (W1 + W2.T) / 2

        for w1, w2, w3 in analogy_triples:
            find_analogies(w1, w2, w3, We, word2idx, idx2word)

reading: enwiki-20180401-pages-articles1.xml-p10p30302-01.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-02.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-03.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-04.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-05.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-06.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-07.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-08.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-09.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-10.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-11.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-12.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-13.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-14.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-15.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-16.txt
reading: