## Embedding Words into Vectors
- Article that inspired this approach: [Neural Word Embedding as Implicit Matrix Factorization](http://u.cs.biu.ac.il/~nlp/wp-content/uploads/Neural-Word-Embeddings-as-Implicit-Matrix-Factorization-NIPS-2014.pdf)
- Text8 corpus, as used by Google's word2vec: [text8 data](http://mattmahoney.net/dc/text8.zip)
- [wordsim353](http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/)
- [MEN Test collection](http://clic.cimec.unitn.it/~elia.bruni/MEN)

## Experimental Observations

In [1]:
## load text data
import re
# `%time` is an IPython magic: it reports the CPU and wall time of the statement.
# `corpus` becomes the list of every `\w+` match in the text8 file, in file order.
# NOTE(review): the file object returned by open() is never explicitly closed.
%time corpus = re.findall(r"\w+", open("data/text8").read())
# Report how many tokens were extracted.
print len(corpus)

CPU times: user 2.85 s, sys: 307 ms, total: 3.16 s
Wall time: 3.16 s
17005207


In [47]:
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import check_array, as_float_array, check_random_state
from sklearn.utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
from sklearn.utils.sparsefuncs import mean_variance_axis

import scipy.sparse as sp
class SymmetricSVD(TruncatedSVD):
    """Truncated SVD that returns ``U * sqrt(Sigma)`` as the embedding.

    Given ``M = U * Sigma * VT``, the traditional truncated SVD returns
    ``W = U * Sigma`` as the transformed vectors; this variant splits the
    singular values symmetrically and returns ``W = U * sqrt(Sigma)``.
    In the paper [Neural Word Embedding as Implicit Matrix Factorization]
    the authors note that it is not clear why this works better, but that
    it does work better than the traditional approach.
    """

    def __init__(self, n_components=2, algorithm="randomized", n_iter=5,
                 random_state=None, tol=0.):
        # Pass everything by keyword so this call does not depend on the
        # positional order of the parent constructor's parameters.
        super(SymmetricSVD, self).__init__(
            n_components=n_components, algorithm=algorithm, n_iter=n_iter,
            random_state=random_state, tol=tol)

    def fit_transform(self, X, y=None):
        """Fit the SVD on X and return ``U * sqrt(Sigma)``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        X_new : array, shape (n_samples, n_components)
            Reduced version of X. This will always be a dense array.

        Raises
        ------
        ValueError
            If ``algorithm`` is neither "arpack" nor "randomized", or if
            ``n_components >= n_features`` with the randomized solver.
        """
        # Local imports keep the method self-contained: `svds` was never
        # imported next to this class (the "arpack" branch raised
        # NameError) and `np` is only imported further down the file.
        import numpy as np
        from scipy.sparse.linalg import svds

        X = as_float_array(X, copy=False)
        random_state = check_random_state(self.random_state)

        # If sparse and not csr or csc, convert to csr
        if sp.issparse(X) and X.getformat() not in ["csr", "csc"]:
            X = X.tocsr()

        if self.algorithm == "arpack":
            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            Sigma = Sigma[::-1]
            U, VT = svd_flip(U[:, ::-1], VT[::-1])

        elif self.algorithm == "randomized":
            k = self.n_components
            n_features = X.shape[1]
            if k >= n_features:
                raise ValueError("n_components must be < n_features;"
                                 " got %d >= %d" % (k, n_features))
            U, Sigma, VT = randomized_svd(X, self.n_components,
                                          n_iter=self.n_iter,
                                          random_state=random_state)
        else:
            raise ValueError("unknown algorithm %r" % self.algorithm)

        self.components_ = VT

        # Use sqrt(Sigma) instead of Sigma itself.  Broadcasting scales
        # column j of U by sqrt(Sigma[j]), which equals
        # U.dot(sqrt(diag(Sigma))) without building the k x k matrix.
        X_transformed = U * np.sqrt(Sigma)

        # Calculate explained variance & explained variance ratio
        self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
        if sp.issparse(X):
            _, full_var = mean_variance_axis(X, axis=0)
            full_var = full_var.sum()
        else:
            full_var = np.var(X, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var
        return X_transformed

In [60]:
from collections import Counter
from scipy import sparse
import numpy as np

class MFWordEmbedder(object):
    """Matrix Factorization Word Embedder.

    Builds a sparse word-by-context matrix of shifted positive PMI values
    from a token sequence (`build_hash` + `build_matrix`) and factorizes it
    with an SVD (`svd`) to obtain one dense vector per retained word.
    """

    def __init__(self, min_wf=120, min_cf=6, window=2, nneg=1.,
                 svdtype="truncated", vec_dim=100):
        """
        Parameters:
        min_wf - minimum frequency for a word to be modelled
        min_cf - minimum frequency for a context (word window) to be modelled
        window - int or list of ints, half-width(s) of the context window:
                 a context is the `window` tokens before plus the `window`
                 tokens after the centre word
        nneg - float, number of negative samples; log(nneg) is the offset
               subtracted from the PMI values in `build_matrix`
        svdtype - string {"symmetric", "truncated"}
        vec_dim - int: dimension of learned word vectors
        """
        self.min_wf = min_wf
        self.min_cf = min_cf
        self.window = window
        self.nneg = nneg
        self.svdtype = svdtype
        self.vec_dim = vec_dim

    def build_hash(self, words):
        """Scan `words` and index the words and contexts that occur in it.

        For every window size and every position that has a full window on
        both sides, the centre word and its surrounding context tuple are
        counted and the (word id, context id) pair is recorded.

        Parameters:
        words - list of tokens

        Returns a 5-tuple:
        word_hash - dict word -> (word id, count)
        context_hash - dict context tuple -> (context id, count)
        wordhash2row - dict word id -> matrix row, only for words whose
                       count reached min_wf
        contexthash2col - dict context id -> matrix column, only for
                          contexts whose count reached min_cf
        wh_ch_pairs - list of (word id, context id), one per scanned position
        """
        # Lazy integer range on both Python 2 (xrange) and Python 3 (range).
        try:
            index_range = xrange
        except NameError:
            index_range = range

        windows = ([self.window] if isinstance(self.window, int)
                   else sorted(self.window))
        word_hash = {}
        context_hash = {}
        # row/col for matrix - only words >= min_wf and contexts >= min_cf
        # will get in
        wordhash2row = {}
        contexthash2col = {}
        wh_ch_pairs = []

        for iw, window in enumerate(windows):
            for i in index_range(window, len(words) - window):
                word = words[i]
                context = tuple(words[i - window:i]
                                + words[i + 1:i + window + 1])

                if iw == 0:
                    # Words are only counted during the first (smallest)
                    # window scan; later scans visit a subset of the same
                    # positions and would double count.
                    wid, wcount = word_hash.get(word, (len(word_hash), 0))
                    wcount += 1
                    word_hash[word] = (wid, wcount)
                    # Assign a row the moment the word becomes frequent
                    # enough (the count passes min_wf exactly once).
                    if wcount == self.min_wf:
                        wordhash2row[wid] = len(wordhash2row)
                else:
                    wid = word_hash[word][0]

                cid, ccount = context_hash.get(context,
                                               (len(context_hash), 0))
                ccount += 1
                context_hash[context] = (cid, ccount)
                # Assign a column the moment the context becomes frequent
                # enough.
                if ccount == self.min_cf:
                    contexthash2col[cid] = len(contexthash2col)

                wh_ch_pairs.append((wid, cid))

        return (word_hash, context_hash, wordhash2row, contexthash2col,
                wh_ch_pairs)

    def build_matrix(self, wordhash2row, contexthash2col, wh_ch_pairs):
        """Build the sparse shifted positive PMI matrix.

        Entry (row(w), col(c)) is
            max(log(#(w, c) * N / (#w * #c)) - log(nneg), 0)
        where N = len(wh_ch_pairs) and the counts are taken over
        wh_ch_pairs.  Pairs whose word or context has no row/column are
        dropped (but still contribute to the counts).

        Parameters:
        wordhash2row - dict word id -> row index
        contexthash2col - dict context id -> column index
        wh_ch_pairs - list of (word id, context id) pairs

        Returns a scipy CSR matrix of dtype float32 with shape
        (len(wordhash2row), len(contexthash2col)).

        Raises ValueError if self.nneg is not positive.
        """
        if self.nneg <= 0:
            raise ValueError("nneg must be positive; got %r" % (self.nneg,))

        widcounter = Counter(wid for wid, _ in wh_ch_pairs)
        cidcounter = Counter(cid for _, cid in wh_ch_pairs)
        npairs = float(len(wh_ch_pairs))
        nrows, ncols = len(wordhash2row), len(contexthash2col)

        data, rows, cols = [], [], []
        for wid, cid in wh_ch_pairs:
            if wid in wordhash2row and cid in contexthash2col:
                data.append(npairs / widcounter[wid] / cidcounter[cid])
                rows.append(wordhash2row[wid])
                cols.append(contexthash2col[cid])

        # Explicit dtypes so that empty input still yields valid arrays.
        M = sparse.coo_matrix(
            (np.asarray(data, dtype=np.float32),
             (np.asarray(rows, dtype=np.intp),
              np.asarray(cols, dtype=np.intp))),
            shape=(nrows, ncols), dtype=np.float32)

        # Each occurrence of (w, c) contributes N / (#w * #c); summing the
        # duplicates BEFORE the log gives #(w, c) * N / (#w * #c), i.e. the
        # PMI ratio.  Taking the log first would sum logs instead.
        M.sum_duplicates()
        M.data = np.log(M.data)
        # Shift by log(nneg); in-place so the data stays float32.
        M.data -= float(np.log(self.nneg))
        # Keep only the positive part and drop the resulting stored zeros.
        np.maximum(M.data, 0, out=M.data)
        M = M.tocsr()
        M.eliminate_zeros()
        return M

    def svd(self, M, *args, **kwargs):
        """Factorize M and store the result in self.word_vectors.

        Uses TruncatedSVD when self.svdtype == "truncated" and SymmetricSVD
        for any other value.  Extra positional/keyword arguments are
        forwarded to the chosen class's constructor after the number of
        components.  Returns None.
        """
        svd_cls = TruncatedSVD if self.svdtype == "truncated" else SymmetricSVD
        # vec_dim goes first positionally so that forwarded *args cannot
        # collide with an n_components keyword.
        decomposer = svd_cls(self.vec_dim, *args, **kwargs)
        self.word_vectors = decomposer.fit_transform(M)

In [61]:
# Smoke test on a tiny input: low thresholds (min_wf=2, min_cf=1), a
# window of 2 tokens on each side and 2-dimensional vectors.
model = MFWordEmbedder(min_wf=2, min_cf=1, window=2, vec_dim=2)
# The first 10 corpus tokens repeated twice, so some words/contexts recur.
a = corpus[:10] * 2
print a
# Index words and contexts, build the sparse matrix, then factorize it;
# svd() stores its result on model.word_vectors.
word_hash, context_hash, wordhash2row, contexthash2col, wh_ch_pairs = model.build_hash(a)
M = model.build_matrix(wordhash2row, contexthash2col, wh_ch_pairs)
model.svd(M)

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [62]:
# Dump the structures returned by build_hash in the preceding cell.
# word -> (word id, count)
print word_hash
# context tuple -> (context id, count)
print context_hash
# word id -> matrix row
print wordhash2row
# context id -> matrix column
print contexthash2col
# (word id, context id) for every scanned position
print wh_ch_pairs

{'a': (1, 2), 'term': (2, 2), 'used': (6, 1), 'anarchism': (8, 1), 'originated': (9, 1), 'of': (3, 2), 'against': (7, 1), 'as': (0, 2), 'abuse': (4, 2), 'first': (5, 2)}
{('a', 'term', 'abuse', 'first'): (3, 2), ('term', 'of', 'first', 'used'): (4, 2), ('originated', 'as', 'term', 'of'): (1, 2), ('anarchism', 'originated', 'a', 'term'): (0, 2), ('abuse', 'first', 'against', 'anarchism'): (6, 1), ('as', 'a', 'of', 'abuse'): (2, 2), ('of', 'abuse', 'used', 'against'): (5, 2), ('against', 'anarchism', 'as', 'a'): (9, 1), ('first', 'used', 'anarchism', 'originated'): (7, 1), ('used', 'against', 'originated', 'as'): (8, 1)}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]


In [63]:
# Run the whole pipeline on the full corpus with the default
# hyper-parameters, timing each stage with the IPython `%time` magic.
model = MFWordEmbedder()
a = corpus
%time word_hash, context_hash, wordhash2row, contexthash2col, wh_ch_pairs = model.build_hash(a)
%time M = model.build_matrix(wordhash2row, contexthash2col, wh_ch_pairs)
%time model.svd(M)

CPU times: user 30.4 s, sys: 1.47 s, total: 31.8 s
Wall time: 31.8 s
CPU times: user 31.3 s, sys: 1.11 s, total: 32.4 s
Wall time: 32.5 s
CPU times: user 15.1 s, sys: 7.06 s, total: 22.1 s
Wall time: 1.64 s
